[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #170335)

Tony Linthicum via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 21 11:48:26 PST 2026


https://github.com/tlinthic updated https://github.com/llvm/llvm-project/pull/170335

>From 04380f60badeea0238d9bdd117d5e2f2fb6a0ddc Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 9 Dec 2025 08:49:29 -0600
Subject: [PATCH 01/35] Allow MachineBlockFrequencyInfo to conditionally be a
 required pass for MachineSchedulerPass via pass parameters.

---
 llvm/include/llvm/CodeGen/MachineScheduler.h   |  3 ++-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h  |  7 ++++---
 llvm/lib/CodeGen/MachineScheduler.cpp          | 11 +++++++----
 llvm/lib/CodeGen/MachineSink.cpp               |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  2 +-
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 33036030679e5..24a5c12ee2300 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1459,9 +1459,10 @@ class MachineSchedulerPass : public PassInfoMixin<MachineSchedulerPass> {
   // analysis.
   std::unique_ptr<impl_detail::MachineSchedulerImpl> Impl;
   const TargetMachine *TM;
+  bool UseMBFI;
 
 public:
-  LLVM_ABI MachineSchedulerPass(const TargetMachine *TM);
+  LLVM_ABI MachineSchedulerPass(const TargetMachine *TM, bool UMBFI = false);
   LLVM_ABI MachineSchedulerPass(MachineSchedulerPass &&Other);
   LLVM_ABI ~MachineSchedulerPass();
   LLVM_ABI PreservedAnalyses run(MachineFunction &MF,
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 005b695741903..ee5ba27aaaa46 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -472,7 +472,8 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
   /// addOptimizedRegAlloc - Add passes related to register allocation.
   /// CodeGenTargetMachineImpl provides standard regalloc passes for most
   /// targets.
-  void addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
+  void addOptimizedRegAlloc(PassManagerWrapper &PMW,
+                            bool RequireMBFI = false) const;
 
   /// Add passes that optimize machine instructions after register allocation.
   void addMachineLateOptimization(PassManagerWrapper &PMW) const;
@@ -1212,7 +1213,7 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::addFastRegAlloc(
 /// scheduling, and register allocation itself.
 template <typename Derived, typename TargetMachineT>
 void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
-    PassManagerWrapper &PMW) const {
+    PassManagerWrapper &PMW, bool RequireMBFI) const {
   addMachineFunctionPass(DetectDeadLanesPass(), PMW);
 
   addMachineFunctionPass(InitUndefPass(), PMW);
@@ -1252,7 +1253,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
   addMachineFunctionPass(RenameIndependentSubregsPass(), PMW);
 
   // PreRA instruction scheduling.
-  addMachineFunctionPass(MachineSchedulerPass(&TM), PMW);
+  addMachineFunctionPass(MachineSchedulerPass(&TM), PMW, RequireMBFI);
 
   if (auto E = derived().addRegAssignmentOptimized(PMW)) {
     // addRegAssignmentOptimized did not add a reg alloc pass, so do nothing.
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index b44d96609c170..8f535b82466fe 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -382,9 +382,10 @@ namespace {
 /// MachineScheduler runs after coalescing and before register allocation.
 class MachineSchedulerLegacy : public MachineFunctionPass {
   MachineSchedulerImpl Impl;
+  bool UseMBFI;
 
 public:
-  MachineSchedulerLegacy();
+  MachineSchedulerLegacy(bool UMBFI = false);
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   bool runOnMachineFunction(MachineFunction&) override;
 
@@ -420,7 +421,9 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass);
 INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
                     "Machine Instruction Scheduler", false, false)
 
-MachineSchedulerLegacy::MachineSchedulerLegacy() : MachineFunctionPass(ID) {
+MachineSchedulerLegacy::MachineSchedulerLegacy(bool UMBFI)
+    : MachineFunctionPass(ID) {
+  UseMBFI = UMBFI;
   initializeMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry());
 }
 
@@ -669,8 +672,8 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
   return Impl.run(MF, TM, {MLI, MDT, AA, LIS, MBFI});
 }
 
-MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM)
-    : Impl(std::make_unique<MachineSchedulerImpl>()), TM(TM) {}
+MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM, bool UMBFI)
+    : Impl(std::make_unique<MachineSchedulerImpl>()), TM(TM), UseMBFI(UMBFI) {}
 MachineSchedulerPass::~MachineSchedulerPass() = default;
 MachineSchedulerPass::MachineSchedulerPass(MachineSchedulerPass &&Other) =
     default;
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 5d46111bb0b14..1af56ac227acc 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -308,6 +308,7 @@ class MachineSinkingLegacy : public MachineFunctionPass {
     AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
     AU.addPreserved<MachineCycleInfoWrapperPass>();
     AU.addPreserved<MachineLoopInfoWrapperPass>();
+    AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
     AU.addRequired<ProfileSummaryInfoWrapperPass>();
     if (UseBlockFreqInfo) {
       AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d2a2f81255344..837a74b2923ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2344,7 +2344,7 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
   if (TM.getOptLevel() > CodeGenOptLevel::Less)
     insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());
 
-  Base::addOptimizedRegAlloc(PMW);
+  Base::addOptimizedRegAlloc(PMW, /* ReqireMBFI */ true);
 }
 
 void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {

>From 6e8bb05486a9441f8fd68d8bc4c8e4a41a0fd710 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 7 Jan 2026 14:37:38 -0600
Subject: [PATCH 02/35] Remove changes to AArch64 test. It should no longer
 fail now that the dependence on MachineBlockFrequencyInfo is conditional.

---
 llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index ff50a3caedb14..d137b8c9ac1e0 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -178,6 +178,7 @@
 ; CHECK-NEXT:       Rename Disconnected Subregister Components
 ; CHECK-NEXT:       Machine Instruction Scheduler
 ; CHECK-NEXT:       AArch64 Post Coalescer pass
+; CHECK-NEXT:       Machine Block Frequency Analysis
 ; CHECK-NEXT:       Debug Variable Analysis
 ; CHECK-NEXT:       Live Stack Slot Analysis
 ; CHECK-NEXT:       Virtual Register Map

>From db5a4547b69b834a57444eeff60df26e91be8347 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Thu, 8 Jan 2026 10:43:04 -0600
Subject: [PATCH 03/35] Fix pass parameter for MachineScheduler with new pass
 manager and add flag to conditionally enable MBFI for legacy pass manager

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h  |  2 +-
 llvm/lib/CodeGen/MachineScheduler.cpp          | 10 ++++++----
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  2 ++
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index ee5ba27aaaa46..b66ec2c6c4a2c 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -1253,7 +1253,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
   addMachineFunctionPass(RenameIndependentSubregsPass(), PMW);
 
   // PreRA instruction scheduling.
-  addMachineFunctionPass(MachineSchedulerPass(&TM), PMW, RequireMBFI);
+  addMachineFunctionPass(MachineSchedulerPass(&TM, RequireMBFI), PMW);
 
   if (auto E = derived().addRegAssignmentOptimized(PMW)) {
     // addRegAssignmentOptimized did not add a reg alloc pass, so do nothing.
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 8f535b82466fe..d01cadebdd554 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -208,6 +208,10 @@ cl::opt<bool> llvm::VerifyScheduling(
     "verify-misched", cl::Hidden,
     cl::desc("Verify machine instrs before and after machine scheduling"));
 
+cl::opt<bool> llvm::RequireMBFILegacySched(
+    "require-mbfi-legacy-sched", cl::Hidden,
+    cl::desc("Require MachineBlocFrequencyInfo for legacy scheduling pass"));
+
 #ifndef NDEBUG
 cl::opt<bool> llvm::ViewMISchedDAGs(
     "view-misched-dags", cl::Hidden,
@@ -382,10 +386,9 @@ namespace {
 /// MachineScheduler runs after coalescing and before register allocation.
 class MachineSchedulerLegacy : public MachineFunctionPass {
   MachineSchedulerImpl Impl;
-  bool UseMBFI;
 
 public:
-  MachineSchedulerLegacy(bool UMBFI = false);
+  MachineSchedulerLegacy();
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   bool runOnMachineFunction(MachineFunction&) override;
 
@@ -421,9 +424,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass);
 INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
                     "Machine Instruction Scheduler", false, false)
 
-MachineSchedulerLegacy::MachineSchedulerLegacy(bool UMBFI)
+MachineSchedulerLegacy::MachineSchedulerLegacy()
     : MachineFunctionPass(ID) {
-  UseMBFI = UMBFI;
   initializeMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry());
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 837a74b2923ac..f9e7453f23bb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1627,6 +1627,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // Have the pre-RA machine scheduler require MachineBlockFrequencyInfo.
+  RequireMBFILegacySched = true;
   TargetPassConfig::addOptimizedRegAlloc();
 }
 

>From 95675e829931ee60075d9493b6981b17309241e1 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 12 Jan 2026 09:02:13 -0600
Subject: [PATCH 04/35] Go back to unconditional dependence of MachineScheduler
 on MBFI

---
 llvm/include/llvm/CodeGen/MachineScheduler.h   |  3 +--
 llvm/include/llvm/Passes/CodeGenPassBuilder.h  |  7 +++----
 llvm/lib/CodeGen/MachineScheduler.cpp          | 11 ++++-------
 llvm/lib/CodeGen/MachineSink.cpp               |  1 -
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  4 +---
 llvm/test/CodeGen/AArch64/O3-pipeline.ll       |  1 -
 6 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 24a5c12ee2300..33036030679e5 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1459,10 +1459,9 @@ class MachineSchedulerPass : public PassInfoMixin<MachineSchedulerPass> {
   // analysis.
   std::unique_ptr<impl_detail::MachineSchedulerImpl> Impl;
   const TargetMachine *TM;
-  bool UseMBFI;
 
 public:
-  LLVM_ABI MachineSchedulerPass(const TargetMachine *TM, bool UMBFI = false);
+  LLVM_ABI MachineSchedulerPass(const TargetMachine *TM);
   LLVM_ABI MachineSchedulerPass(MachineSchedulerPass &&Other);
   LLVM_ABI ~MachineSchedulerPass();
   LLVM_ABI PreservedAnalyses run(MachineFunction &MF,
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index b66ec2c6c4a2c..005b695741903 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -472,8 +472,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
   /// addOptimizedRegAlloc - Add passes related to register allocation.
   /// CodeGenTargetMachineImpl provides standard regalloc passes for most
   /// targets.
-  void addOptimizedRegAlloc(PassManagerWrapper &PMW,
-                            bool RequireMBFI = false) const;
+  void addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
 
   /// Add passes that optimize machine instructions after register allocation.
   void addMachineLateOptimization(PassManagerWrapper &PMW) const;
@@ -1213,7 +1212,7 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::addFastRegAlloc(
 /// scheduling, and register allocation itself.
 template <typename Derived, typename TargetMachineT>
 void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
-    PassManagerWrapper &PMW, bool RequireMBFI) const {
+    PassManagerWrapper &PMW) const {
   addMachineFunctionPass(DetectDeadLanesPass(), PMW);
 
   addMachineFunctionPass(InitUndefPass(), PMW);
@@ -1253,7 +1252,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
   addMachineFunctionPass(RenameIndependentSubregsPass(), PMW);
 
   // PreRA instruction scheduling.
-  addMachineFunctionPass(MachineSchedulerPass(&TM, RequireMBFI), PMW);
+  addMachineFunctionPass(MachineSchedulerPass(&TM), PMW);
 
   if (auto E = derived().addRegAssignmentOptimized(PMW)) {
     // addRegAssignmentOptimized did not add a reg alloc pass, so do nothing.
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index d01cadebdd554..9c369a14696a1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -208,10 +208,6 @@ cl::opt<bool> llvm::VerifyScheduling(
     "verify-misched", cl::Hidden,
     cl::desc("Verify machine instrs before and after machine scheduling"));
 
-cl::opt<bool> llvm::RequireMBFILegacySched(
-    "require-mbfi-legacy-sched", cl::Hidden,
-    cl::desc("Require MachineBlocFrequencyInfo for legacy scheduling pass"));
-
 #ifndef NDEBUG
 cl::opt<bool> llvm::ViewMISchedDAGs(
     "view-misched-dags", cl::Hidden,
@@ -674,8 +670,8 @@ bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
   return Impl.run(MF, TM, {MLI, MDT, AA, LIS, MBFI});
 }
 
-MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM, bool UMBFI)
-    : Impl(std::make_unique<MachineSchedulerImpl>()), TM(TM), UseMBFI(UMBFI) {}
+MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM)
+    : Impl(std::make_unique<MachineSchedulerImpl>()), TM(TM) {}
 MachineSchedulerPass::~MachineSchedulerPass() = default;
 MachineSchedulerPass::MachineSchedulerPass(MachineSchedulerPass &&Other) =
     default;
@@ -713,7 +709,8 @@ MachineSchedulerPass::run(MachineFunction &MF,
   return getMachineFunctionPassPreservedAnalyses()
       .preserveSet<CFGAnalyses>()
       .preserve<SlotIndexesAnalysis>()
-      .preserve<LiveIntervalsAnalysis>();
+      .preserve<LiveIntervalsAnalysis>()
+      .preserve<MachineBlockFrequencyAnalysis>();
 }
 
 bool PostMachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 1af56ac227acc..5d46111bb0b14 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -308,7 +308,6 @@ class MachineSinkingLegacy : public MachineFunctionPass {
     AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
     AU.addPreserved<MachineCycleInfoWrapperPass>();
     AU.addPreserved<MachineLoopInfoWrapperPass>();
-    AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
     AU.addRequired<ProfileSummaryInfoWrapperPass>();
     if (UseBlockFreqInfo) {
       AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f9e7453f23bb7..d2a2f81255344 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1627,8 +1627,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
-  // Have the pre-RA machine scheduler require MachineBlockFrequencyInfo.
-  RequireMBFILegacySched = true;
   TargetPassConfig::addOptimizedRegAlloc();
 }
 
@@ -2346,7 +2344,7 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
   if (TM.getOptLevel() > CodeGenOptLevel::Less)
     insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass());
 
-  Base::addOptimizedRegAlloc(PMW, /* ReqireMBFI */ true);
+  Base::addOptimizedRegAlloc(PMW);
 }
 
 void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const {
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index d137b8c9ac1e0..ff50a3caedb14 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -178,7 +178,6 @@
 ; CHECK-NEXT:       Rename Disconnected Subregister Components
 ; CHECK-NEXT:       Machine Instruction Scheduler
 ; CHECK-NEXT:       AArch64 Post Coalescer pass
-; CHECK-NEXT:       Machine Block Frequency Analysis
 ; CHECK-NEXT:       Debug Variable Analysis
 ; CHECK-NEXT:       Live Stack Slot Analysis
 ; CHECK-NEXT:       Virtual Register Map

>From d93f07ad5e99044f8416bd6446cf4d0285b7a9b8 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 14 Jan 2026 06:34:02 -0600
Subject: [PATCH 05/35] Have PHI elimination fix MBFI when splitting edges

---
 llvm/lib/CodeGen/PHIElimination.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 1c054cca72cde..d77e87f058f4c 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -224,6 +224,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
+  AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
+  AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
   AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }

>From 868706f4085c0f2648e015b01bdc178293d317db Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 14 Jan 2026 08:59:17 -0600
Subject: [PATCH 06/35] Fix minor changes in the pipeline. The differences are
 in pass order only, not new pass runs.

---
 llvm/lib/CodeGen/PHIElimination.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index d77e87f058f4c..e6a5183cf528c 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -129,7 +129,7 @@ class PHIEliminationImpl {
     auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
     auto *MDTWrapper =
         P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
-    auto *PDTWrapper =
+    auto *PDTWrapper = 
         P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
     auto *MBPIWrapper =
         P->getAnalysisIfAvailable<MachineBranchProbabilityInfoWrapperPass>();
@@ -224,8 +224,6 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
-  AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
-  AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
   AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }

>From 4efac3567a8e06a8ed9fab38df77d5117ae50ff7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 15 Jul 2025 15:10:41 -0700
Subject: [PATCH 07/35] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
 to AGPR

Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
---
 .../llvm/CodeGen/MachineInstrBuilder.h        |   15 +
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |   30 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  641 ++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |   70 +-
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 5591 +++++++++++++++++
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  524 ++
 6 files changed, 6866 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index eb9bcfb7c01a3..7f389952bd765 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -516,6 +516,21 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
   return MachineInstrBuilder(MF, MI).copyMIMetadata(MIMD);
 }
 
+/// This version of the builder inserts the newly-built instruction after the
+/// given position in the given MachineBasicBlock, and does NOT take a
+/// destination register.
+inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
+                                        MachineBasicBlock::iterator I,
+                                        const MIMetadata &MIMD,
+                                        const MCInstrDesc &MCID) {
+  MachineFunction &MF = *BB.getParent();
+  MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
+  BB.insertAfter(I, MI);
+  return MachineInstrBuilder(MF, MI)
+      .setPCSections(MIMD.getPCSections())
+      .setMMRAMetadata(MIMD.getMMRAMetadata());
+}
+
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
                                    MachineBasicBlock::instr_iterator I,
                                    const MIMetadata &MIMD,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 15853a35d230e..a87e1d984a626 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -102,6 +102,36 @@ struct GCNRegPressure {
                                                 DynamicVGPRBlockSize));
   }
 
+  unsigned getVGPRSpills(const GCNSubtarget &ST, MachineFunction &MF) {
+    if (!ST.hasGFX90AInsts())
+      return 0;
+
+    auto MaxVectorRegs = ST.getMaxNumVectorRegs(MF.getFunction());
+    unsigned ArchVGPRThreshold = MaxVectorRegs.first;
+    unsigned AGPRThreshold = MaxVectorRegs.second;
+
+    unsigned ArchPressure = getArchVGPRNum();
+    unsigned AGPRPressure = getAGPRNum();
+
+    unsigned ArchSpill = ArchPressure > ArchVGPRThreshold
+                             ? (ArchPressure - ArchVGPRThreshold)
+                             : 0;
+    unsigned AGPRSpill =
+        AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
+
+    unsigned UnifiedSpill = 0;
+
+    if (ST.hasGFX90AInsts()) {
+      unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+      unsigned UnifiedPressure = getVGPRNum(true);
+      UnifiedSpill = UnifiedPressure > CombinedThreshold
+                         ? (UnifiedPressure - CombinedThreshold)
+                         : 0;
+    }
+
+    return std::max(UnifiedSpill, (ArchSpill + AGPRSpill));
+  }
+
   void inc(unsigned Reg,
            LaneBitmask PrevMask,
            LaneBitmask NewMask,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index cabf759762a72..1ad31fd455449 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -690,6 +691,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -946,6 +948,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -1183,6 +1187,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1216,6 +1223,112 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+SlotIndex
+RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
+                                       LiveIntervals *LIS,
+                                       SmallVectorImpl<SlotIndex> &DefIdxs) {
+  assert(UseMO.isReg());
+  MachineInstr *UseMI = UseMO.getParent();
+  LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
+  auto VNInfo = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
+
+  SlotIndex DefMBBStart =
+      LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNInfo->def));
+
+  // If the def is in the block, then it must be the only reaching def.
+  if (DefMBBStart != VNInfo->def) {
+    DefIdxs.push_back(VNInfo->def);
+    return VNInfo->def;
+  }
+
+  SmallPtrSet<MachineBasicBlock *, 8> Visited;
+  SmallVector<MachineBasicBlock *, 8> Worklist;
+
+  Visited.insert(UseMI->getParent());
+
+  // Mark the predecessor blocks for traversal
+  for (auto PredMBB : UseMI->getParent()->predecessors()) {
+    Worklist.push_back(PredMBB);
+    Visited.insert(PredMBB);
+  }
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
+
+    SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
+    auto VNInfo = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
+
+    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNInfo->def);
+    SlotIndex DefMBBStart = LIS->getMBBStartIdx(DefMBB);
+
+    // If there is a def in this block, then add it to the list. This is the
+    // reaching def of this path.
+    if (DefMBBStart != VNInfo->def) {
+      DefIdxs.push_back(VNInfo->def);
+      continue;
+    }
+
+    for (auto PredMBB : DefMBB->predecessors()) {
+      if (Visited.insert(PredMBB).second)
+        Worklist.push_back(PredMBB);
+    }
+  }
+
+  return VNInfo->def;
+}
+
+void RewriteScheduleStage::findReachingUses(
+    MachineInstr *DefMI, LiveIntervals *LIS,
+    SmallVectorImpl<MachineOperand *> &ReachingUses) {
+  SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
+  for (auto &UseMO :
+       DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
+    SmallVector<SlotIndex, 8> ReachingDefIndexes;
+    findReachingDefs(UseMO, LIS, ReachingDefIndexes);
+
+    // If we find a use that contains this DefMI in its reachingDefs, then it is
+    // a reaching use.
+    if (find_if(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
+          return SlotIndex::isSameInstr(RDIdx, DefIdx);
+        }) != ReachingDefIndexes.end())
+      ReachingUses.push_back(&UseMO);
+  }
+}
+
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
+  RegionsWithExcessArchVGPR.reset();
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    auto PressureBefore = DAG.Pressure[Region];
+    if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+      RegionsWithExcessArchVGPR[Region] = true;
+  }
+
+  if (!ST.hasGFX90AInsts() || RegionsWithExcessArchVGPR.none())
+    return false;
+
+  TII = ST.getInstrInfo();
+  SRI = ST.getRegisterInfo();
+
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
+  DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
+  SmallPtrSet<MachineInstr *, 8> CopyForDef;
+
+  if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
+    return false;
+
+  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form which
+  // may result in less cross RC copies.
+  if (Cost > 0)
+    return false;
+
+  return rewrite(RewriteCands);
+}
+
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
@@ -1837,6 +1950,534 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+  // Only MAI instructions for which getMFMASrcCVDstAGPROp has an opcode qualify.
+  const auto *InstrInfo = static_cast<const SIInstrInfo *>(DAG.TII);
+  const bool IsMAI = InstrInfo->isMAI(*MI);
+  return IsMAI && AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+bool RewriteScheduleStage::initHeuristics(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  // Speculatively rewrite every candidate in the function while recording
+  // where copies would be needed.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!isRewriteCandidate(&MI))
+        continue;
+
+      const unsigned OrigOpc = MI.getOpcode();
+      const int AGPROpc = AMDGPU::getMFMASrcCVDstAGPROp(OrigOpc);
+      if (AGPROpc == -1)
+        continue;
+
+      // Remember the original opcode alongside the rewritten instruction.
+      RewriteCands.push_back({&MI, OrigOpc});
+      MI.setDesc(TII->get(AGPROpc));
+
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      const bool Src2IsReg = Src2->isReg();
+      if (Src2IsReg) {
+        SmallVector<SlotIndex, 8> Src2Defs;
+        findReachingDefs(*Src2, DAG.LIS, Src2Defs);
+
+        // Each non-MFMA definition reaching src2 needs a copy.
+        for (SlotIndex DefIdx : Src2Defs) {
+          MachineInstr *DefMI = DAG.LIS->getInstructionFromIndex(DefIdx);
+          if (!TII->isMAI(*DefMI))
+            CopyForDef.insert(DefMI);
+        }
+      }
+
+      SmallVector<MachineOperand *, 8> DstUses;
+      findReachingUses(&MI, DAG.LIS, DstUses);
+
+      for (MachineOperand *UseOp : DstUses) {
+        MachineInstr *UseMI = UseOp->getParent();
+        if (TII->isMAI(*UseMI))
+          continue;
+
+        // Each non-MFMA user of the MFMA result needs a copy; the set keeps
+        // this to one copy per register per user block.
+        CopyForUse[UseMI->getParent()].insert(UseOp->getReg());
+
+        SmallVector<SlotIndex, 8> UseDefs;
+        findReachingDefs(*UseOp, DAG.LIS, UseDefs);
+
+        // Each non-MFMA definition reaching that user also needs a copy, so
+        // that all reaching defs of the use end up in AGPR and the AGPR to
+        // VGPR copy can sit at the user rather than after the MFMA.
+        for (SlotIndex DefIdx : UseDefs) {
+          MachineInstr *DefMI = DAG.LIS->getInstructionFromIndex(DefIdx);
+          if (!TII->isMAI(*DefMI))
+            CopyForDef.insert(DefMI);
+        }
+      }
+
+      // Do the rewrite to allow for updated RP calculation.
+      const Register DstReg = MI.getOperand(0).getReg();
+      const TargetRegisterClass *AGPRRC =
+          SRI->getEquivalentAGPRClass(DAG.MRI.getRegClass(DstReg));
+      DAG.MRI.setRegClass(DstReg, AGPRRC);
+      if (Src2IsReg)
+        DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+    }
+  }
+
+  return true;
+}
+
+int64_t RewriteScheduleStage::getRewriteCost(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
+  const uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
+  // Undo the speculative rewrite done by initHeuristics: restore the VGPR
+  // register classes and the original opcodes. This has to run on every exit
+  // path, because the caller inserts no copies when the cost is positive. The
+  // real rewrite must happen after copy-insertion, as some defs of the
+  // register may require VGPR.
+  auto ResetToVGPRForm = [&]() {
+    for (const auto &[MI, OrigOpc] : RewriteCands) {
+      assert(TII->isMAI(*MI));
+      const Register DstReg = MI->getOperand(0).getReg();
+      const TargetRegisterClass *VGPRRC =
+          SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DstReg));
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+      if (Src2->isReg())
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      DAG.MRI.setRegClass(DstReg, VGPRRC);
+      MI->setDesc(TII->get(OrigOpc));
+    }
+  };
+
+  int64_t BestSpillCost = 0;
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    if (!RegionsWithExcessArchVGPR[Region])
+      continue;
+
+    auto PressureBefore = DAG.Pressure[Region];
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to manageable
+    // level. If we find any such region, then the rewrite is potentially
+    // beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(Region);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+
+    uint64_t BlockFreq =
+        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+            .getFrequency();
+
+    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
+    uint64_t RelativeFreq = EntryFreq && BlockFreq
+                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
+                                                       : BlockFreq / EntryFreq)
+                                : 1;
+
+    // This assumes perfect spilling / splitting -- using one spill / copy
+    // instruction and one restoreFrom / copy for each excess register,
+    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
+
+    // Also account for the block frequency.
+    if (RelativeFreqIsDenom)
+      SpillCost /= (int64_t)RelativeFreq;
+    else
+      SpillCost *= (int64_t)RelativeFreq;
+
+    // If we have increased spilling in any block, just bail -- but only after
+    // restoring the VGPR form, since the caller will not rewrite.
+    if (SpillCost > 0) {
+      ResetToVGPRForm();
+      return SpillCost;
+    }
+
+    if (SpillCost < BestSpillCost)
+      BestSpillCost = SpillCost;
+  }
+
+  // Set the cost to the largest decrease in spill cost in order to not double
+  // count spill reductions.
+  int64_t Cost = BestSpillCost;
+  assert(Cost <= 0);
+
+  // Accumulate in 64 bits: the per-block frequency factors are uint64_t.
+  uint64_t CopyCost = 0;
+
+  // For each CopyForDef, increase the cost by the register size while
+  // accounting for block frequency.
+  for (auto *DefMI : CopyForDef) {
+    auto DefReg = DefMI->getOperand(0).getReg();
+    uint64_t DefFreq =
+        EntryFreq
+            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            : 1;
+
+    unsigned RegSize = DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(DefReg));
+    unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+    CopyCost += NumRegs * DefFreq;
+  }
+
+  // Account for CopyForUse copies in each block that the register is used.
+  for (auto &UseEntry : CopyForUse) {
+    uint64_t UseFreq =
+        EntryFreq ? MBFI.getBlockFreq(UseEntry.first).getFrequency() / EntryFreq
+                  : 1;
+
+    for (auto UseReg : UseEntry.second) {
+      unsigned RegSize =
+          DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
+      unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+      CopyCost += NumRegs * UseFreq;
+    }
+  }
+
+  Cost += CopyCost;
+  ResetToVGPRForm();
+  return Cost;
+}
+
+bool RewriteScheduleStage::rewrite(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+  DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
+  DenseMap<MachineInstr *, unsigned> LastMIToRegion;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    auto Entry = DAG.Regions[Region];
+    if (Entry.first == Entry.second)
+      continue;
+
+    FirstMIToRegion[&*Entry.first] = Region;
+    if (Entry.second != Entry.first->getParent()->end())
+      LastMIToRegion[&*Entry.second] = Region;
+  }
+
+  // Rewrite the MFMAs to AGPR, and insert any copies as needed.
+  // The general assumption of the algorithm (and the previous cost calculation)
+  // is that it is better to insert the copies in the MBB of the def of the src2
+  // operands, and in the MBB of the user of the dest operands. This is based on
+  // the assumption that the MFMAs are likely to appear in loop bodies, while
+  // the src2 and dest operands are live-in / live-out of the loop. Due to this
+  // design, the algorithm for finding copy insertion points is more
+  // complicated.
+  //
+  // There are three main cases to handle: 1. the reaching defs of the src2
+  // operands, 2. the reaching uses of the dst operands, and 3. the reaching
+  // defs of the reaching uses of the dst operand.
+  //
+  // In the first case, we simply insert copies after each of the reaching
+  // definitions. In the second case, we collect all the uses of a given dest
+  // and organize them by MBB. Then, we insert 1 copy for each MBB before the
+  // earliest use. Since the use may have multiple reaching defs, and since we
+  // want to replace the register it is using with the result of the copy, we
+  // must handle case 3. In the third case, we simply insert a copy after each
+  // of the reaching defs to connect to the copy of the reaching uses of the dst
+  // reg. This allows us to avoid inserting copies next to the MFMAs.
+  //
+  // While inserting the copies, we maintain a map of operands which will use
+  // different regs (i.e. the result of the copies). For example, a case 1 src2
+  // operand will use the register result of the copies after the reaching defs,
+  // as opposed to the original register. Now that we have completed our copy
+  // analysis and placement, we can bulk update the registers. We do this
+  // separately as to avoid complicating the reachingDef and reachingUse
+  // queries.
+  //
+  // While inserting the copies, we also maintain a list of registers which we
+  // will want to reclassify as AGPR. After doing the copy insertion and the
+  // register replacement, we can finally do the reclassification. This uses the
+  // redef map, as the registers we are interested in reclassifying may be
+  // replaced by the result of a copy. We must do this after the copy analysis
+  // and placement as we must have an accurate redef map -- otherwise we may end
+  // up creating illegal instructions.
+
+  // The original registers of the MFMA that need to be reclassified as AGPR
+  std::set<Register> RewriteRegs;
+  // The map of an original register in the MFMA to a new register (result of a
+  // copy) that it should be replaced with.
+  DenseMap<Register, Register> RedefMap;
+  // The map of the original MFMA registers to the relevant MFMA operands.
+  DenseMap<Register, std::set<MachineOperand *>> ReplaceMap;
+  // The map of reaching defs for a given register -- to avoid duplicate copies.
+  DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
+  // The map of reaching uses for a given register by basic block -- to avoid
+  // duplicate copies and to calculate per MBB insert pts.
+  DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
+      ReachingUseTracker;
+
+  for (auto &RI : RewriteCands) {
+    MachineInstr &MI = *RI.first;
+
+    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+    if (ReplacementOp == -1)
+      continue;
+    MI.setDesc(TII->get(ReplacementOp));
+
+    // Case 1: insert copies for the reaching defs of the Src2Reg.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+
+    if (Src2->isReg()) {
+      Register Src2Reg = Src2->getReg();
+      if (!Src2Reg.isVirtual())
+        return false;
+
+      Register MappedReg = Src2->getReg();
+      SmallVector<SlotIndex, 8> Src2ReachingDefs;
+      findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+      SmallVector<MachineInstr *, 8> Src2DefsReplace;
+
+      for (auto RDIndex : Src2ReachingDefs) {
+        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+        if (TII->isMAI(*RD))
+          continue;
+
+        // If there is a non mai reaching def, then we need a copy.
+        if (find(Src2DefsReplace, RD) == Src2DefsReplace.end())
+          Src2DefsReplace.push_back(RD);
+      }
+
+      if (!Src2DefsReplace.empty()) {
+        if (RedefMap.contains(Src2Reg))
+          MappedReg = RedefMap[Src2Reg];
+        else {
+          assert(!ReachingDefCopyMap.contains(Src2Reg));
+          const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
+          const TargetRegisterClass *VGPRRC =
+              SRI->getEquivalentVGPRClass(Src2RC);
+
+          // Track the mapping of the original register to the new register.
+          MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+          RedefMap[Src2Reg] = MappedReg;
+        }
+
+        // If none exists, create a copy from this reaching def.
+        // We may have inserted a copy already in an earlier iteration.
+        for (MachineInstr *RD : Src2DefsReplace) {
+          // Do not create redundant copies.
+          if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
+            MachineInstrBuilder VGPRCopy =
+                BuildMIAfter(*RD->getParent(), RD->getIterator(),
+                             RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                    .addDef(MappedReg, 0, 0)
+                    .addUse(Src2Reg, 0, 0);
+            DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+            // If this reaching def was the last MI in the region, update the
+            // region boundaries.
+            if (LastMIToRegion.contains(RD)) {
+              unsigned UpdateRegion = LastMIToRegion[RD];
+              DAG.Regions[UpdateRegion].second = VGPRCopy;
+              LastMIToRegion.erase(RD);
+            }
+          }
+        }
+      }
+
+      // Track the register for reclassification
+      RewriteRegs.insert(Src2Reg);
+
+      // Always insert the operand for replacement. If this corresponds with a
+      // chain of tied-def we may not see the VGPR requirement until later.
+      ReplaceMap[Src2Reg].insert(Src2);
+    }
+
+    // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
+    // and after the reaching defs of the reaching uses of the dsts.
+
+    MachineOperand *Dst = &MI.getOperand(0);
+    Register DstReg = Dst->getReg();
+    if (!DstReg.isVirtual())
+      return false;
+
+    Register MappedReg = DstReg;
+    SmallVector<MachineOperand *, 8> DstReachingUses;
+
+    SmallVector<MachineOperand *, 8> DstReachingUseCopies;
+    SmallVector<MachineInstr *, 8> DstUseDefsReplace;
+
+    findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+    for (MachineOperand *RUOp : DstReachingUses) {
+      if (TII->isMAI(*RUOp->getParent()))
+        continue;
+
+      // If there is a non mai reaching use, then we need a copy.
+      if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
+        DstReachingUseCopies.push_back(RUOp);
+      SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+      findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+      for (auto RDIndex : DstUsesReachingDefs) {
+        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+        if (TII->isMAI(*RD))
+          continue;
+
+        // If there is a non mai reaching def of this reaching use, then we will
+        // need a copy.
+        if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
+          DstUseDefsReplace.push_back(RD);
+      }
+    }
+
+    if (!DstUseDefsReplace.empty()) {
+      if (RedefMap.contains(DstReg))
+        MappedReg = RedefMap[DstReg];
+      else {
+        assert(!ReachingDefCopyMap.contains(DstReg));
+        const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+        const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+
+        // Track the mapping of the original register to the new register.
+        MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+        RedefMap[DstReg] = MappedReg;
+      }
+
+      // If none exists, create a copy from this reaching def.
+      // We may have inserted a copy already in an earlier iteration.
+      for (MachineInstr *RD : DstUseDefsReplace) {
+        // Do not create redundant copies.
+        if (ReachingDefCopyMap[DstReg].insert(RD).second) {
+          MachineInstrBuilder VGPRCopy =
+              BuildMIAfter(*RD->getParent(), RD->getIterator(),
+                           RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(MappedReg, 0, 0)
+                  .addUse(DstReg, 0, 0);
+          DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+          // If this reaching def was the last MI in the region, update the
+          // region boundaries.
+          if (LastMIToRegion.contains(RD)) {
+            unsigned UpdateRegion = LastMIToRegion[RD];
+            DAG.Regions[UpdateRegion].second = VGPRCopy;
+            LastMIToRegion.erase(RD);
+          }
+        }
+      }
+    }
+
+    for (MachineOperand *RU : DstReachingUseCopies) {
+      MachineBasicBlock *RUBlock = RU->getParent()->getParent();
+      // Just keep track of the reaching use of this register by block. After we
+      // have scanned all the MFMAs we can find optimal insert pts.
+      if (RUBlock != MI.getParent()) {
+        ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
+        continue;
+      }
+
+      // Special case, the use is in the same block as the MFMA. Insert the copy
+      // just before the use.
+      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+      MachineInstr *UseInst = RU->getParent();
+      MachineInstrBuilder VGPRCopy =
+          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              .addDef(NewUseReg, 0, 0)
+              .addUse(DstReg, 0, 0);
+      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+      // Since we know this use has only one reaching def, we can replace the
+      // use reg.
+      RU->setReg(NewUseReg);
+      // Track the copy source operand for replacement.
+      ReplaceMap[DstReg].insert(&VGPRCopy->getOperand(1));
+    }
+
+    // Track the register for reclassification
+    RewriteRegs.insert(DstReg);
+    // Insert the dst operand for replacement. If this dst is in a chain of
+    // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
+    // all the correspond operands need to be replaced.
+    ReplaceMap[DstReg].insert(Dst);
+  }
+
+  // Handle the copies for dst uses.
+  for (auto &RUBlockEntry : ReachingUseTracker) {
+    for (auto &RUDst : RUBlockEntry.second) {
+      MachineOperand *OpBegin = *RUDst.second.begin();
+      SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
+
+      // Find the earliest use in this block.
+      for (auto *User : RUDst.second) {
+        SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
+        if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
+          InstPt = NewInstPt;
+      }
+
+      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+      MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
+
+      MachineInstrBuilder VGPRCopy =
+          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              .addDef(NewUseReg, 0, 0)
+              .addUse(RUDst.first, 0, 0);
+      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+      // If this UseInst was the first MI in the region, update the region
+      // boundaries so that the region also covers the new copy.
+      if (FirstMIToRegion.contains(UseInst)) {
+        unsigned UpdateRegion = FirstMIToRegion[UseInst];
+        DAG.Regions[UpdateRegion].first = VGPRCopy;
+        FirstMIToRegion.erase(UseInst);
+      }
+
+      // Replace the operand for all users.
+      for (auto *User : RUDst.second) {
+        User->setReg(NewUseReg);
+      }
+
+      // Track the copy source operand for replacement.
+      ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
+    }
+  }
+
+  // We may have needed to insert copies after the reaching defs of the MFMAs.
+  // Replace the original register with the result of the copy for all relevant
+  // operands.
+  for (auto NewDef : RedefMap) {
+    Register OldReg = NewDef.first;
+    Register NewReg = NewDef.second;
+
+    // Replace the register for any associated operand in the MFMA chain.
+    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) {
+      ReplaceOp->setReg(NewReg);
+    }
+  }
+
+  // Finally, do the reclassification of the MFMA registers.
+  for (auto RewriteReg : RewriteRegs) {
+    Register RegToRewrite = RewriteReg;
+
+    // Be sure to update the replacement register and not the original.
+    if (RedefMap.contains(RewriteReg))
+      RegToRewrite = RedefMap[RewriteReg];
+
+    const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
+    const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
+
+    DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
+  }
+
+  // Bulk update the LIS.
+  DAG.LIS->reanalyze(DAG.MF);
+  // Liveins may have been modified for cross RC copies
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
+    DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
+
+  return true;
+}
+
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const Function &F = MF.getFunction();
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 95a931b9beb2a..e2d4f49b4ef16 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,9 @@
 #include "GCNRegPressure.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 
@@ -28,11 +31,12 @@ class GCNSchedStage;
 
 enum class GCNSchedStageID : unsigned {
   OccInitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  ILPInitialSchedule = 4,
-  MemoryClauseInitialSchedule = 5
+  RewriteSchedule = 1,
+  UnclusteredHighRPReschedule = 2,
+  ClusteredLowOccupancyReschedule = 3,
+  PreRARematerialize = 4,
+  ILPInitialSchedule = 5,
+  MemoryClauseInitialSchedule = 6
 };
 
 #ifndef NDEBUG
@@ -239,6 +243,7 @@ using RegionBoundaries =
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
+  friend class RewriteScheduleStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
@@ -413,6 +418,61 @@ class OccInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class RewriteScheduleStage : public GCNSchedStage {
+private:
+  // Record regions with excess archvgpr register pressure over the physical
+  // register limit. Register pressure in these regions usually will result in
+  // spilling.
+  BitVector RegionsWithExcessArchVGPR;
+
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
+
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *SRI;
+
+  /// Do a speculative rewrite and collect copy locations. The speculative
+  /// rewrite allows us to calculate the RP of the code after the rewrite, and
+  /// the copy locations allow us to calculate the total cost of copies required
+  /// for the rewrite. Stores the rewritten instructions in \p RewriteCands ,
+  /// the copy locations for uses (of the MFMA result) in \p CopyForUse and the
+  /// copy locations for defs (of the MFMA operands) in \p CopyForDef
+  bool
+  initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+  /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
+  /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
+  /// costs, and \p RewriteCands to undo rewriting.
+  int64_t
+  getRewriteCost(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+  /// Do the final rewrite on \p RewriteCands and insert any needed copies.
+  bool rewrite(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
+
+  /// \returns true if this MI is a rewrite candidate.
+  bool isRewriteCandidate(MachineInstr *MI) const;
+
+  /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p
+  /// DefIdxs
+  SlotIndex findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
+                             SmallVectorImpl<SlotIndex> &DefIdxs);
+
+  /// Finds all the reaching uses of \p DefMI and stores the use operands in \p
+  /// ReachingUses
+  void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS,
+                        SmallVectorImpl<MachineOperand *> &ReachingUses);
+
+public:
+  bool initGCNSchedStage() override;
+
+  RewriteScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 class UnclusteredHighRPStage : public GCNSchedStage {
 private:
   // Save the initial occupancy before starting this stage.
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
new file mode 100644
index 0000000000000..73eeafb6bccc5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -0,0 +1,5591 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_singleedef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @no_copy_for_mfma() #0 {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+...
+
+
+---
+name:            src2_singledef_singleuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_singleuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+name:            src2_singledef_singleuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ; Test input: a run of V_MFMA_SCALE_*_vgprcd_e64 instructions in bb.2 whose
+  ; first element reads %84 as its third register operand.
+  ; NOTE(review): in this input %84 has a single definition (bb.0) and two
+  ; uses (the first MFMA in bb.2 and the V_ADD_U32 in bb.4), which reads as
+  ; "singledef/multiuse" rather than the "multidef_singleuse" in the function
+  ; name - confirm the intended naming.
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    ; %1-%13 are only consumed by the KILL in bb.4 and %50/%51 only by the
+    ; S_NOP in bb.4 (presumably to create register pressure - confirm).
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    ; Only definition of %84 in this function.
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    ; %85-%88 get a first definition here and a second one from the MFMAs in
+    ; bb.2, so each of them has two definitions.
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    ; The taken branch goes straight to bb.4, bypassing the MFMA blocks.
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    ; Dependent chain through the third register operand:
+    ; %84 -> %85 -> %86 -> %87 -> %88.
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    ; The remaining MFMAs all read %87 and define registers (%89-%93, %193)
+    ; that have no other definition; their only use is the KILL in bb.3.
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    ; Second use of %84, outside the MFMA chain.
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    ; Non-MFMA read of %88, the last register of the dependent chain.
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ; Test input: a run of V_MFMA_SCALE_*_vgprcd_e64 instructions in bb.5 whose
+  ; first element reads %84 as its third register operand.
+  ; NOTE(review): in this input %84 has two definitions (bb.2 and bb.3) and a
+  ; single use (the first MFMA in bb.5), which reads as "multidef/singleuse"
+  ; rather than the "singledef_multiuse" in the function name - confirm the
+  ; intended naming.
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    ; %1-%13 are only consumed by the KILL in bb.7 and %50/%51 only by the
+    ; S_NOP in bb.7 (presumably to create register pressure - confirm).
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    ; %85-%88 get a first definition here and a second one from the MFMAs in
+    ; bb.5, so each of them has two definitions.
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    ; First of the two definitions of %84 (from %72.sub0).
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    ; Second definition of %84 (from %72.sub1), on the other branch arm.
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    ; Join of bb.2/bb.3; the taken branch goes to bb.7, bypassing the MFMA
+    ; blocks.
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    ; Dependent chain through the third register operand:
+    ; %84 -> %85 -> %86 -> %87 -> %88.
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    ; The remaining MFMAs all read %87 and define registers (%89-%93, %193)
+    ; that have no other definition; their only use is the KILL in bb.6.
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.7:
+    ; Non-MFMA read of %88, the last register of the dependent chain.
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_multiuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.7:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_singleuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.5:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+---
+name:            src2_singledef_multiuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.8:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.8:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6]]
+  ; CHECK-NEXT:   KILL [[COPY10]], [[COPY5]], [[COPY12]], [[COPY7]], [[COPY14]], [[COPY9]], [[COPY16]], [[COPY11]], [[COPY6]], [[COPY13]], [[COPY8]], [[COPY15]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %194:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %195:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %196:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %197:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %198:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %199:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193, %194, %195, %196, %197, %198, %199
+
+
+  bb.3:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+
+  bb.6:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.3:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.5:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.6:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+
+  bb.7:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.9, implicit killed $scc
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.10
+
+  bb.9:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.10:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.9:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+---
+# Input: %84 is loaded once in bb.0 (DS_READ_B128_gfx9) and is then the src2
+# operand of ten V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64
+# instructions in bb.2. The first four also redefine %84; the other six define
+# %89-%93 and %193, which bb.3 consumes with KILL. bb.4 stores %84 with
+# DS_WRITE_B128_gfx9.
+# Expected output: every MFMA uses the non-_vgprcd opcode with an
+# areg_128_align2 dst/src2, a COPY into areg_128_align2 follows the DS_READ,
+# and COPYs back to vreg_128_align2 feed the KILL and DS_WRITE users.
+# NOTE(review): the wide IMPLICIT_DEF values at the top of bb.0 and %50/%51 are
+# only consumed by the final KILL / S_NOP; presumably they exist to create
+# register pressure -- confirm against the lit run line, which is outside this
+# view.
+name:            src2_singledef_singleuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[COPY2]], [[COPY5]], [[COPY3]], [[COPY6]], [[COPY4]], [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY7]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+# Input: %84 is loaded in bb.0 (DS_READ_B128_gfx9); bb.1 holds ten
+# V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 instructions that all take
+# %84 as src2. The first four redefine %84; the other six define %89-%93 and
+# %193, which bb.2 consumes with KILL. bb.3 reads %84.sub0 in a V_ADD_U32_e32.
+# Expected output: the MFMAs use the non-_vgprcd opcode on areg_128_align2,
+# with a COPY into areg_128_align2 after the DS_READ and COPYs back to
+# vreg_128_align2 ahead of the KILL and V_ADD_U32_e32 users.
+# NOTE(review): apart from block numbering and the final user of %84, the input
+# matches src2_singledef_singleuse_dst_singleuse_singledef_agpr -- confirm the
+# "multidef" in this name describes what is intended.
+name:            src2_multidef_singleuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[COPY2]], [[COPY5]], [[COPY3]], [[COPY6]], [[COPY4]], [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[COPY7]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193
+
+
+  bb.3:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+# Input: %84 is loaded by a DS_READ_B128_gfx9 in either bb.2 (first immediate
+# 0) or bb.3 (first immediate 128). bb.4 then runs four
+# V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 instructions that each
+# read %84 as src2 and redefine it. bb.7 consumes %84 with KILL.
+# Expected output: a COPY into areg_128_align2 follows each DS_READ, the MFMAs
+# use the non-_vgprcd opcode on that areg_128_align2 value, and a COPY back to
+# vreg_128_align2 precedes the final KILL.
+# NOTE(review): "singleedef" in the name looks like a typo for "singledef" (the
+# spelling used by the neighbouring tests); a rename must also update the label
+# assertion at the top of the body.
+name:            src2_singledef_multiuse_dst_singleuse_singleedef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_singleedef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+
+  bb.4:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+# Input: %84 is loaded by a DS_READ_B128_gfx9 in either bb.2 or bb.3 (first
+# immediate 0 vs 128). bb.4 chains four
+# V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 instructions: %85 takes
+# %84 as src2, and each of %86, %87, %88 takes the previous result. bb.7 stores
+# %88 with DS_WRITE_B128_gfx9 and consumes %85-%88 with KILL.
+# Expected output: a COPY into areg_128_align2 follows each DS_READ, the MFMAs
+# use the non-_vgprcd opcode with areg_128_align2 results, and each result is
+# copied back to vreg_128_align2 before its DS_WRITE / KILL user.
+name:            src2_multidef_multiuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+# In the input, %84 (src2 of the first MFMA) is defined once, by the DS_READ
+# in bb.0, and read only by that MFMA. %88 (dst of the last MFMA) is defined
+# twice -- by a DS_READ in bb.0 and by the MFMA in bb.2, which the conditional
+# branch in bb.0 can skip -- and is stored by the single DS_WRITE in bb.3
+# (it is also listed in the KILL).
+# The expected output switches all four MFMAs from the _vgprcd_e64 form to the
+# _e64 form with areg_128_align2 results, copies the incoming VGPR values into
+# AGPRs in bb.0, and copies them back to VGPRs for the DS_WRITE and the KILL.
+# Blocks are numbered consecutively in the expected output, so input
+# bb.2/bb.3 show up there as bb.1/bb.2.
+# NOTE(review): the wide IMPLICIT_DEFs that stay live until the KILL presumably
+# exist to create VGPR pressure -- confirm.
+name:            src2_singledef_singleuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+# In the input, %84 is defined once (DS_READ in bb.0) and read twice: as src2
+# of the first MFMA in bb.2 and by the first DS_WRITE in bb.3. %88 is defined
+# by a DS_READ in bb.0 and by the last MFMA, and stored by the second DS_WRITE.
+# The expected output feeds the MFMA chain from an AGPR copy of the DS_READ
+# result, while the DS_WRITE of %84 keeps reading the original VGPR value.
+# NOTE(review): the name says src2 "multidef_singleuse", yet this body has one
+# def and two uses of %84 (the following test shows the opposite mismatch) --
+# confirm whether the two names are swapped or follow another convention.
+name:            src2_multidef_singleuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+# In the input, %84 is defined on both arms of a diamond (DS_READs in bb.2 and
+# bb.3) and read once, as src2 of the first MFMA in bb.5, which the conditional
+# branch in bb.4 can skip. %88 is defined by a DS_READ in bb.0 and by the last
+# MFMA, and stored by the DS_WRITE in bb.6.
+# The expected output adds a VGPR-to-AGPR COPY right after each DS_READ of %84
+# and rewrites the MFMA chain to the AGPR-result (_e64) form.
+# NOTE(review): the name says src2 "singledef_multiuse", yet this body has two
+# defs and one use of %84 (the preceding test shows the opposite mismatch) --
+# confirm whether the two names are swapped or follow another convention.
+name:            src2_singledef_multiuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+# In the input, %84 is defined on both arms of a diamond (DS_READs in bb.2 and
+# bb.3) and read twice: as src2 of the first MFMA in bb.5 and by the first
+# DS_WRITE in bb.6. %88 is defined by a DS_READ in bb.0 and by the last MFMA,
+# and stored by the second DS_WRITE.
+# The expected output copies %84 into an AGPR after each of its defs for the
+# rewritten (_e64) MFMA chain, while the DS_WRITE of %84 keeps reading the
+# VGPR value directly.
+name:            src2_multidef_multiuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 384, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 128, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 384, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+--- # Input: four chained vgprcd-form scaled MFMAs in bb.1, seeded by the DS_READ_B128 result %84, which is also stored by a DS_WRITE_B128 in bb.5. The CHECK lines expect the non-vgprcd opcode with areg_128_align2 results, an areg COPY of the read feeding the first MFMA, and vreg COPYs in front of the DS_WRITE_B32 and KILL users.
+name:            src2_multidef_singleuse_dst_multiuse_singledef_agpr # NOTE(review): in the input %84 (src2 of the first MFMA) has one def (bb.0) and two uses (MFMA in bb.1, DS_WRITE_B128 in bb.5), which reads as singledef_multiuse - confirm the intended name
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 256, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+
+--- # Input: %84 is written by a DS_READ_B128 in each of the two blocks that follow bb.0 and is consumed only as src2 of the first of four chained vgprcd-form scaled MFMAs. The CHECK lines expect an areg COPY right after each read, the non-vgprcd opcode with areg_128_align2 results, and vreg COPYs in front of the DS_WRITE_B32 and KILL users.
+name:            src2_singledef_multiuse_dst_multiuse_singledef_agpr # NOTE(review): in the input %84 (src2 of the first MFMA) has two defs (bb.2 and bb.3) and a single use (the MFMA in bb.4), which reads as multidef_singleuse - confirm the intended name
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.8:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+--- # Input: %84 is written by a DS_READ_B128 in each of the two blocks that follow bb.0 and has two users: src2 of the first of four chained vgprcd-form scaled MFMAs, and a DS_WRITE_B32 of %84.sub0 in the final block. The CHECK lines expect an areg COPY right after each read, the non-vgprcd opcode with areg_128_align2 results, the final DS_WRITE_B32 still reading the DS_READ result, and vreg COPYs in front of the other DS_WRITE_B32 and KILL users.
+name:            src2_multidef_multiuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+   %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %84.sub0:vreg_128_align2, 256, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY6]], 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+name:            src2_multidef_singleuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 256, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+name:            src2_singledef_multiuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.9:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.9:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 256, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+---
+# Scenario: the src2 operand of the first MFMA (%84) has a single def (in bb.0)
+# and a single use; the last MFMA result (%88) has a single def and, apart from
+# the trailing KILL, a single use (the DS_WRITE in the final block).
+# The expected output below shows the four chained MFMAs rewritten from the
+# _vgprcd_e64 opcode to the _e64 opcode with areg_128_align2 results, a
+# VGPR->AGPR COPY of %84 at the end of bb.0, and AGPR->VGPR COPYs in the last
+# block ahead of the DS_WRITE and the KILL.
+name:            src2_singledef_singleuse_dst_singleuse_singledef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ; Input MIR. Blocks are labelled bb.0/bb.2/bb.3 here; the expected output
+  ; above shows them renumbered consecutively.
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    ; Wide VGPR tuples that stay live until the KILL in the last block.
+    ; NOTE(review): presumably these exist to create VGPR pressure so that the
+    ; rewrite is triggered - confirm against the RUN line and the stage heuristics.
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    ; Two av_512 values that stay live until the final S_NOP.
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    ; The only def of %84: a sub0-only write. %84 is the src2 operand of the
+    ; first MFMA below.
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.2:
+    ; Chain of four MFMAs: each result is the src2 operand of the next one.
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    ; Single non-KILL use of the last MFMA result.
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
+# Scenario: the src2 operand of the first MFMA (%84) is defined on two
+# alternative paths (bb.2 and bb.3) and used once; the MFMA results %85-%88 also
+# have earlier defs in bb.0 (IMPLICIT_DEF, or a sub0-only V_ADD_U32 for %88),
+# and %88 is read on two alternative paths after the MFMA block (bb.7 and bb.8).
+# The MFMA block itself (bb.5) can be branched over from bb.4.
+# The expected output below shows the MFMAs rewritten from the _vgprcd_e64
+# opcode to the _e64 opcode with areg_128_align2 results, a VGPR->AGPR COPY
+# after each non-MFMA def, and AGPR->VGPR COPYs ahead of the non-MFMA users.
+name:            src2_multidef_singleuse_dst_multiuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[DEF21:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF21]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ; Input MIR. Blocks are labelled bb.0 and bb.2-bb.9 here; the expected output
+  ; above shows them renumbered consecutively.
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    ; Wide VGPR tuples that stay live until the KILL in the last block.
+    ; NOTE(review): presumably these exist to create VGPR pressure so that the
+    ; rewrite is triggered - confirm against the RUN line and the stage heuristics.
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    ; Two av_512 values that stay live until the final S_NOP.
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    ; First defs of %85-%88; each register is defined again by an MFMA in bb.5,
+    ; which is what makes the MFMA destinations multi-def in this test.
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    ; First of the two alternative defs of %84 (sub0-only write).
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    ; Second alternative def of %84 (full 128-bit load).
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    ; Conditionally branches over the MFMA block.
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    ; Chain of four MFMAs: each result is the src2 operand of the next one.
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    ; First user path of %88: full-register store. %94 gets a placeholder def
+    ; here so that it is defined on both paths into bb.9.
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    %94:vreg_128_align2 = IMPLICIT_DEF
+    S_BRANCH %bb.9
+
+  bb.8:
+    ; Second user path of %88: subregister reads through VALU instructions.
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.9:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+---
+# Scenario: the src2 operand of the first MFMA (%84) has a single def (in bb.0)
+# and two uses (the first MFMA in bb.2 and a V_ADD_U32 in bb.4); %88 is defined
+# both in bb.0 (sub0-only V_ADD_U32) and by an MFMA in bb.2 and, apart from the
+# trailing KILL, is read once (the DS_WRITE in bb.4). bb.2 additionally holds
+# six MFMAs (%89-%93, %193) that all read %87 and are only consumed by the KILL
+# in bb.3. bb.0 can branch directly to bb.4, skipping bb.2 and bb.3.
+# The expected output below shows all ten MFMAs rewritten from the _vgprcd_e64
+# opcode to the _e64 opcode with areg_128_align2 results; %84 stays in a VGPR
+# for its V_ADD_U32 user and is copied to an AGPR for the MFMA.
+name:            src2_singledef_multiuse_dst_singleuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY11]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ; Input MIR. Blocks are labelled bb.0 and bb.2-bb.4 here; the expected output
+  ; above shows them renumbered consecutively.
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    ; Wide VGPR tuples that stay live until the KILL in the last block.
+    ; NOTE(review): presumably these exist to create VGPR pressure so that the
+    ; rewrite is triggered - confirm against the RUN line and the stage heuristics.
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    ; Two av_512 values that stay live until the final S_NOP.
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    ; The only def of %84 (sub0-only write); read by the first MFMA in bb.2 and
+    ; by the V_ADD_U32 in bb.4.
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    ; First defs of %85-%88; each register is defined again by an MFMA in bb.2.
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    ; Chain of four MFMAs: each result is the src2 operand of the next one.
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    ; Six further MFMAs that all take %87 as src2; their results are only
+    ; consumed by the KILL in bb.3.
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    ; Second use of %84 (VALU read of sub0), then the single non-KILL use of %88.
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY2]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    %94:vreg_128_align2 = IMPLICIT_DEF
+
+  bb.8:
+    %95:vgpr_32 = V_ADD_U32_e32 %84.sub0, %64, implicit $exec
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            no_copy_for_mfma # All eight MFMAs below are expected in the AGPR-destination _e64 form; see the notes on the input blocks.
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: no_copy_for_mfma
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   dead [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF21]]
+  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF22]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0: ; input numbering skips bb.1; the checks above show input bb.N (N >= 2) printed as bb.(N-1)
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec ; checks expect a COPY of %84 into areg_128_align2 right after this def
+    %88:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2: ; printed as bb.1: checks expect a COPY of this %88 def into areg_128_align2
+    %88:vreg_128_align2 = IMPLICIT_DEF
+    S_BRANCH %bb.4
+
+
+  bb.3: ; printed as bb.2: checks expect all four MFMAs as _e64 with areg_128_align2 results
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4: ; printed as bb.3: checks expect the four MFMAs on %88 as _e64 with no COPY between them, and a COPY into areg_128_align2 after each IMPLICIT_DEF
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+
+  bb.5: ; printed as bb.4: checks expect COPYs back to vreg_128_align2 for the V_ADD and KILL operands
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
new file mode 100644
index 0000000000000..050e4bc5e941c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -0,0 +1,524 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @more_copies_than_spills() #0 { ; IR stub; the MIR function of the same name below holds the test body
+  entry:
+    unreachable
+  }
+
+  define void @less_copies_than_spills() #0 { ; IR stub; the MIR function of the same name below holds the test body
+  entry:
+    unreachable
+  }
+
+  define void @low_pressure() { ; IR stub declared without attribute group #0, unlike the two functions above
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"} ; referenced by the first two functions only
+...
+
+
+---
+name:            more_copies_than_spills # Expected output keeps the single MFMA in its _vgprcd_e64 (VGPR-destination) form and contains no COPYs.
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: more_copies_than_spills
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF12]], [[DEF13]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.9(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.9, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.10(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub2, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[DEF18]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0: ; input numbering skips bb.1; the checks above show input bb.N (N >= 2) printed as bb.(N-1)
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF ; gives %85 a def on the path where bb.4 branches past the MFMA in bb.5
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5: ; printed as bb.4: checks expect this MFMA to stay _vgprcd_e64 with a vreg_128_align2 result
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7: ; %85 is read here and in bb.9, bb.10 and bb.11; presumably these are the copies a rewrite would need (per the test name) - TODO confirm against the cost model
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.10, implicit killed $scc
+
+  bb.9:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.10:
+   undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %85.sub0, implicit $exec
+   %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub2, %64, implicit $exec
+
+  bb.11: ; printed as bb.10: checks expect %85 to be read directly, with no COPY
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %85.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+
+---
+name:            less_copies_than_spills # Expected output rewrites the MFMA to the AGPR-destination _e64 form, with COPYs at the defs feeding it and at the VGPR users of its result.
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: less_copies_than_spills
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0: ; input numbering skips bb.1; the checks above show input bb.N (N >= 2) printed as bb.(N-1)
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF ; checks expect a COPY of this def into areg_128_align2
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2: ; printed as bb.1: checks expect a COPY of %84 into areg_128_align2 after the V_ADD
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3: ; printed as bb.2: checks expect a COPY of %84 into areg_128_align2 after the V_ADD
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5: ; printed as bb.4: checks expect this MFMA as _e64 with an areg_128_align2 result
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7: ; printed as bb.6: checks expect %85 copied back to vreg_128_align2 before the V_ADDs
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8: ; printed as bb.7: checks expect %85 copied back to vreg_128_align2 before the V_ADDs
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.9: ; printed as bb.8: checks expect one COPY back to vreg_128_align2 for the KILL operand, while %84 is read directly
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            low_pressure
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: low_pressure
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %5
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %6
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF9:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF13]].sub0, [[DEF5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF4]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF12]], [[DEF6]], [[DEF7]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %5, implicit %6
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.9:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...

>From 1f2991f613a8ff6717b96b01b374d4dc1355aa31 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 22 Aug 2025 16:14:19 -0700
Subject: [PATCH 08/35] Do not rewrite to AGPR if waves-per-eu >= 2

Change-Id: I4ab71a3c739a203399a201e47d6b37ceba723bf2
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1ad31fd455449..14be10e17ce29 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1297,6 +1297,8 @@ void RewriteScheduleStage::findReachingUses(
 
 bool RewriteScheduleStage::initGCNSchedStage() {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
+    return false;
 
   RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
   RegionsWithExcessArchVGPR.reset();
@@ -1306,7 +1308,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
       RegionsWithExcessArchVGPR[Region] = true;
   }
 
-  if (!ST.hasGFX90AInsts() || RegionsWithExcessArchVGPR.none())
+  if (RegionsWithExcessArchVGPR.none())
     return false;
 
   TII = ST.getInstrInfo();

>From 3e14b6372698705a55b6ad3fa516129bfefbc4ba Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 4 Sep 2025 16:22:01 -0700
Subject: [PATCH 09/35] Review comments

Change-Id: I99db02cea2777024b4948a55d6a298c384f40534
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |   3 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 130 ++++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  22 ++--
 3 files changed, 74 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a87e1d984a626..07a8c3c34146c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -102,7 +102,8 @@ struct GCNRegPressure {
                                                 DynamicVGPRBlockSize));
   }
 
-  unsigned getVGPRSpills(const GCNSubtarget &ST, MachineFunction &MF) {
+  unsigned getVGPRSpills(MachineFunction &MF) {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (!ST.hasGFX90AInsts())
       return 0;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 14be10e17ce29..09b73c4aaa9f1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1223,22 +1223,20 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
-SlotIndex
-RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
-                                       LiveIntervals *LIS,
-                                       SmallVectorImpl<SlotIndex> &DefIdxs) {
+void RewriteScheduleStage::findReachingDefs(
+    MachineOperand &UseMO, LiveIntervals *LIS,
+    SmallVectorImpl<SlotIndex> &DefIdxs) {
   assert(UseMO.isReg());
   MachineInstr *UseMI = UseMO.getParent();
   LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
-  auto VNInfo = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
+  VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
 
-  SlotIndex DefMBBStart =
-      LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNInfo->def));
+  SlotIndex DefMBBStart = LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNI->def));
 
   // If the def is in the block, then it must be the only reaching def.
-  if (DefMBBStart != VNInfo->def) {
-    DefIdxs.push_back(VNInfo->def);
-    return VNInfo->def;
+  if (DefMBBStart != VNI->def) {
+    DefIdxs.push_back(VNI->def);
+    return;
   }
 
   SmallPtrSet<MachineBasicBlock *, 8> Visited;
@@ -1256,15 +1254,15 @@ RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
     MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
 
     SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
-    auto VNInfo = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
+    VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
 
-    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNInfo->def);
+    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
     SlotIndex DefMBBStart = LIS->getMBBStartIdx(DefMBB);
 
     // If there is a def in this block, then add it to the list. This is the
     // reaching def of this path.
-    if (DefMBBStart != VNInfo->def) {
-      DefIdxs.push_back(VNInfo->def);
+    if (DefMBBStart != VNI->def) {
+      DefIdxs.push_back(VNI->def);
       continue;
     }
 
@@ -1273,8 +1271,6 @@ RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
         Worklist.push_back(PredMBB);
     }
   }
-
-  return VNInfo->def;
 }
 
 void RewriteScheduleStage::findReachingUses(
@@ -1288,9 +1284,9 @@ void RewriteScheduleStage::findReachingUses(
 
     // If we find a use that contains this DefMI in its reachingDefs, then it is
     // a reaching use.
-    if (find_if(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
+    if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
           return SlotIndex::isSameInstr(RDIdx, DefIdx);
-        }) != ReachingDefIndexes.end())
+        }))
       ReachingUses.push_back(&UseMO);
   }
 }
@@ -1966,27 +1962,29 @@ bool RewriteScheduleStage::initHeuristics(
   // Prepare for the heuristics
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
-      if (isRewriteCandidate(&MI)) {
-        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-        if (ReplacementOp == -1)
-          continue;
+      if (!isRewriteCandidate(&MI))
+        continue;
 
-        RewriteCands.push_back({&MI, MI.getOpcode()});
-        MI.setDesc(TII->get(ReplacementOp));
+      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+      if (ReplacementOp == -1)
+        continue;
 
-        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-        if (Src2->isReg()) {
-          SmallVector<SlotIndex, 8> Src2ReachingDefs;
-          findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+      RewriteCands.push_back({&MI, MI.getOpcode()});
+      MI.setDesc(TII->get(ReplacementOp));
 
-          // For any definition of the src2 register which is non-MFMA, we
-          // insert a copy.
-          for (SlotIndex RDIdx : Src2ReachingDefs) {
-            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
-            if (!TII->isMAI(*RD))
-              CopyForDef.insert(RD);
-          }
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2->isReg()) {
+        SmallVector<SlotIndex, 8> Src2ReachingDefs;
+        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+        // For any definition of the src2 register which is non-MFMA, we
+        // insert a copy.
+        for (SlotIndex RDIdx : Src2ReachingDefs) {
+          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+          if (!TII->isMAI(*RD))
+            CopyForDef.insert(RD);
         }
+      }
 
         MachineOperand &Dst = MI.getOperand(0);
         SmallVector<MachineOperand *, 8> DstReachingUses;
@@ -2024,7 +2022,6 @@ bool RewriteScheduleStage::initHeuristics(
         DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
         if (Src2->isReg())
           DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
-      }
     }
   }
 
@@ -2032,28 +2029,32 @@ bool RewriteScheduleStage::initHeuristics(
 }
 
 int64_t RewriteScheduleStage::getRewriteCost(
-    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
-    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
-    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
+
   MBFI.calculate(MF, MBPI, *DAG.MLI);
   int64_t BestSpillCost = 0;
   int64_t Cost = 0;
 
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
     if (!RegionsWithExcessArchVGPR[Region])
       continue;
 
-    auto PressureBefore = DAG.Pressure[Region];
-    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(MF);
 
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
     // beneficial.
-    auto PressureAfter = DAG.getRealRegPressure(Region);
-    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(MF);
 
-    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
     uint64_t BlockFreq =
         MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
@@ -2090,8 +2091,6 @@ int64_t RewriteScheduleStage::getRewriteCost(
 
   unsigned CopyCost = 0;
 
-  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
-
   // For each CopyForDef, increase the cost by the register size while
   // accounting for block frequency.
   for (auto *DefMI : CopyForDef) {
@@ -2107,12 +2106,11 @@ int64_t RewriteScheduleStage::getRewriteCost(
   }
 
   // Account for CopyForUse copies in each block that the register is used.
-  for (auto &UseEntry : CopyForUse) {
+  for (auto &[UseBlock, UseRegs] : CopyForUse) {
     uint64_t UseFreq =
-        EntryFreq ? MBFI.getBlockFreq(UseEntry.first).getFrequency() / EntryFreq
-                  : 1;
+        EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
-    for (auto UseReg : UseEntry.second) {
+    for (auto UseReg : UseRegs) {
       unsigned RegSize =
           DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
       unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
@@ -2124,9 +2122,7 @@ int64_t RewriteScheduleStage::getRewriteCost(
 
   // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
   // defs of the register may require VGPR.
-  for (auto RI : RewriteCands) {
-    MachineInstr *MI = RI.first;
-
+  for (auto &[MI, OriginalOpcode] : RewriteCands) {
     assert(TII->isMAI(*MI));
     const TargetRegisterClass *AGPRRC =
         DAG.MRI.getRegClass(MI->getOperand(0).getReg());
@@ -2135,18 +2131,17 @@ int64_t RewriteScheduleStage::getRewriteCost(
     MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
     assert(Src2);
 
-    if (Src2->isReg()) {
+    if (Src2->isReg())
       DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
-    }
     DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
-    MI->setDesc(TII->get(RI.second));
+    MI->setDesc(TII->get(OriginalOpcode));
   }
 
   return Cost;
 }
 
 bool RewriteScheduleStage::rewrite(
-    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
   DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
   DenseMap<MachineInstr *, unsigned> LastMIToRegion;
 
@@ -2180,7 +2175,7 @@ bool RewriteScheduleStage::rewrite(
   // want to replace the register it is using with the result of the copy, we
   // must handle case 3. In the third case, we simply insert a copy after each
   // of the reaching defs to connect to the copy of the reaching uses of the dst
-  // reg. This allows us to avoid inserting copies next to the' MFMAs.
+  // reg. This allows us to avoid inserting copies next to the MFMAs.
   //
   // While inserting the copies, we maintain a map of operands which will use
   // different regs (i.e. the result of the copies). For example, a case 1 src2
@@ -2191,14 +2186,14 @@ bool RewriteScheduleStage::rewrite(
   // queries.
   //
   // While inserting the copies, we also maintain a list or registers which we
-  // will want to reclassify as AGPR. After doing the copy isnertion and the
+  // will want to reclassify as AGPR. After doing the copy insertion and the
   // register replacement, we can finally do the reclassification. This uses the
   // redef map, as the registers we are interested in reclassifying may be
   // replaced by the result of a copy. We must do this after the copy analysis
   // and placement as we must have an accurate redef map -- otherwise we may end
   // up creating illegal instructions.
 
-  // The original registers of the MFMA that need to be reclassified as AGPR
+  // The original registers of the MFMA that need to be reclassified as AGPR.
   std::set<Register> RewriteRegs;
   // The map of an original register in the MFMA to a new register (result of a
   // copy) that it should be replaced with.
@@ -2212,16 +2207,15 @@ bool RewriteScheduleStage::rewrite(
   DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
       ReachingUseTracker;
 
-  for (auto &RI : RewriteCands) {
-    MachineInstr &MI = *RI.first;
+  for (auto &[MI, OriginalOpcode] : RewriteCands) {
 
-    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
     if (ReplacementOp == -1)
       continue;
-    MI.setDesc(TII->get(ReplacementOp));
+    MI->setDesc(TII->get(ReplacementOp));
 
     // Case 1: insert copies for the reaching defs of the Src2Reg.
-    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
 
     if (Src2->isReg()) {
       Register Src2Reg = Src2->getReg();
@@ -2291,7 +2285,7 @@ bool RewriteScheduleStage::rewrite(
     // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
     // and after the reaching defs of the reaching uses of the dsts.
 
-    MachineOperand *Dst = &MI.getOperand(0);
+    MachineOperand *Dst = &MI->getOperand(0);
     Register DstReg = Dst->getReg();
     if (!DstReg.isVirtual())
       return false;
@@ -2302,7 +2296,7 @@ bool RewriteScheduleStage::rewrite(
     SmallVector<MachineOperand *, 8> DstReachingUseCopies;
     SmallVector<MachineInstr *, 8> DstUseDefsReplace;
 
-    findReachingUses(&MI, DAG.LIS, DstReachingUses);
+    findReachingUses(MI, DAG.LIS, DstReachingUses);
 
     for (MachineOperand *RUOp : DstReachingUses) {
       if (TII->isMAI(*RUOp->getParent()))
@@ -2366,7 +2360,7 @@ bool RewriteScheduleStage::rewrite(
       MachineBasicBlock *RUBlock = RU->getParent()->getParent();
       // Just keep track of the reaching use of this register by block. After we
       // have scanned all the MFMAs we can find optimal insert pts.
-      if (RUBlock != MI.getParent()) {
+      if (RUBlock != MI->getParent()) {
         ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
         continue;
       }
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e2d4f49b4ef16..f7a9f79bf7364 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -425,9 +425,6 @@ class RewriteScheduleStage : public GCNSchedStage {
   // spilling.
   BitVector RegionsWithExcessArchVGPR;
 
-  MachineBranchProbabilityInfo MBPI;
-  MachineBlockFrequencyInfo MBFI;
-
   const SIInstrInfo *TII;
   const SIRegisterInfo *SRI;
 
@@ -443,23 +440,24 @@ class RewriteScheduleStage : public GCNSchedStage {
                  SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
   /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
-  /// in initHueristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
+  /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
   /// costs, and \p RewriteCands to undo rewriting.
-  int64_t
-  getRewriteCost(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
-                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
-                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+  int64_t getRewriteCost(
+      const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+      const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+      const SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
   /// Do the final rewrite on \p RewriteCands and insert any needed copies.
-  bool rewrite(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
+  bool
+  rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
 
   /// \returns true if this MI is a rewrite candidate.
   bool isRewriteCandidate(MachineInstr *MI) const;
 
   /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p
-  /// DefIdx
-  SlotIndex findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
-                             SmallVectorImpl<SlotIndex> &DefIdxs);
+  /// DefIdxs
+  void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
+                        SmallVectorImpl<SlotIndex> &DefIdxs);
 
   /// Finds all the reaching uses of \p DefMI and stores the use operands in \p
   /// ReachingUses

>From 4778b115ecfd969108644825cefe26982a3c18c7 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Thu, 20 Nov 2025 09:30:11 -0600
Subject: [PATCH 10/35] merge of PR 149367

---
 llvm/include/llvm/CodeGen/MachineInstrBuilder.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index 7f389952bd765..46b5c96135e5a 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -527,8 +527,7 @@ inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
   MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
   BB.insertAfter(I, MI);
   return MachineInstrBuilder(MF, MI)
-      .setPCSections(MIMD.getPCSections())
-      .setMMRAMetadata(MIMD.getMMRAMetadata());
+      .copyMIMetadata(MIMD);
 }
 
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,

>From 27489c1100e92bb02f3cea6b40679c418e036ba2 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 25 Nov 2025 08:20:48 -0600
Subject: [PATCH 11/35] Address PR 149367 review comments

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 18 ++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  2 +-
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 09b73c4aaa9f1..f74c91229a86e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1245,7 +1245,7 @@ void RewriteScheduleStage::findReachingDefs(
   Visited.insert(UseMI->getParent());
 
   // Mark the predecessor blocks for traversal
-  for (auto PredMBB : UseMI->getParent()->predecessors()) {
+  for (auto *PredMBB : UseMI->getParent()->predecessors()) {
     Worklist.push_back(PredMBB);
     Visited.insert(PredMBB);
   }
@@ -1266,7 +1266,7 @@ void RewriteScheduleStage::findReachingDefs(
       continue;
     }
 
-    for (auto PredMBB : DefMBB->predecessors()) {
+    for (auto *PredMBB : DefMBB->predecessors()) {
       if (Visited.insert(PredMBB).second)
         Worklist.push_back(PredMBB);
     }
@@ -1966,8 +1966,7 @@ bool RewriteScheduleStage::initHeuristics(
         continue;
 
       int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-      if (ReplacementOp == -1)
-        continue;
+      assert(ReplacementOp != -1)
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
       MI.setDesc(TII->get(ReplacementOp));
@@ -2238,9 +2237,9 @@ bool RewriteScheduleStage::rewrite(
       }
 
       if (!Src2DefsReplace.empty()) {
-        if (RedefMap.contains(Src2Reg))
+        if (RedefMap.contains(Src2Reg)) {
           MappedReg = RedefMap[Src2Reg];
-        else {
+        } else {
           assert(!ReachingDefCopyMap.contains(Src2Reg));
           const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
           const TargetRegisterClass *VGPRRC =
@@ -2399,7 +2398,7 @@ bool RewriteScheduleStage::rewrite(
       SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
 
       // Find the earliest use in this block.
-      for (auto User : RUDst.second) {
+      for (auto *User : RUDst.second) {
         SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
         if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
           InstPt = NewInstPt;
@@ -2426,7 +2425,7 @@ bool RewriteScheduleStage::rewrite(
       }
 
       // Replace the operand for all users.
-      for (auto User : RUDst.second) {
+      for (auto *User : RUDst.second) {
         User->setReg(NewUseReg);
       }
 
@@ -2443,9 +2442,8 @@ bool RewriteScheduleStage::rewrite(
     Register NewReg = NewDef.second;
 
     // Replace the register for any associated operand in the MFMA chain.
-    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) {
+    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
       ReplaceOp->setReg(NewReg);
-    }
   }
 
   // Finally, do the reclassification of the MFMA registers.
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f7a9f79bf7364..76dba27d6f83f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -429,7 +429,7 @@ class RewriteScheduleStage : public GCNSchedStage {
   const SIRegisterInfo *SRI;
 
   /// Do a speculative rewrite and collect copy locations. The speculative
-  /// rewrite allows us to calulcate the RP of the code after the rewrite, and
+  /// rewrite allows us to calculate the RP of the code after the rewrite, and
   /// the copy locations allow us to calculate the total cost of copies required
   /// for the rewrite. Stores the rewritten instructions in \p RewriteCands ,
   /// the copy locations for uses (of the MFMA result) in \p CopyForUse and the

>From 72ac441f9a4a6cc3f4f8987e9000b7d2b361a7cd Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 1 Dec 2025 11:09:22 -0600
Subject: [PATCH 12/35] Address PR 149367 review comments

---
 .../llvm/CodeGen/MachineInstrBuilder.h        | 23 ++++++++-----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 20 ++++++++--------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index 46b5c96135e5a..af5d91d09a608 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -432,6 +432,15 @@ class MachineInstrBuilder {
     return *this;
   }
 
+  /// Inserts the newly-built instruction after the given position in the
+  /// given MachineBasicBlock.
+  const MachineInstrBuilder &insertAfter(MachineInstr *MInstr) const {
+    MachineBasicBlock *MBB = MInstr->getParent();
+    MachineBasicBlock::iterator I = MInstr->getIterator();
+    MBB->insertAfter(I, MI);
+    return *this;
+  }
+
   bool constrainAllUses(const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI,
                         const RegisterBankInfo &RBI) const {
@@ -516,20 +525,6 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
   return MachineInstrBuilder(MF, MI).copyMIMetadata(MIMD);
 }
 
-/// This version of the builder inserts the newly-built instruction after the
-/// given position in the given MachineBasicBlock, and does NOT take a
-/// destination register.
-inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
-                                        MachineBasicBlock::iterator I,
-                                        const MIMetadata &MIMD,
-                                        const MCInstrDesc &MCID) {
-  MachineFunction &MF = *BB.getParent();
-  MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
-  BB.insertAfter(I, MI);
-  return MachineInstrBuilder(MF, MI)
-      .copyMIMetadata(MIMD);
-}
-
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
                                    MachineBasicBlock::instr_iterator I,
                                    const MIMetadata &MIMD,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f74c91229a86e..d9894551050a8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1966,7 +1966,7 @@ bool RewriteScheduleStage::initHeuristics(
         continue;
 
       int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-      assert(ReplacementOp != -1)
+      assert(ReplacementOp != -1);
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
       MI.setDesc(TII->get(ReplacementOp));
@@ -2256,10 +2256,10 @@ bool RewriteScheduleStage::rewrite(
           // Do not create redundant copies.
           if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
             MachineInstrBuilder VGPRCopy =
-                BuildMIAfter(*RD->getParent(), RD->getIterator(),
-                             RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                     .addDef(MappedReg, 0, 0)
-                    .addUse(Src2Reg, 0, 0);
+                    .addUse(Src2Reg, 0, 0)
+                    .insertAfter(RD);
             DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
             // If this reaching def was the last MI in the region, update the
@@ -2338,10 +2338,10 @@ bool RewriteScheduleStage::rewrite(
         // Do not create reundant copies.
         if (ReachingDefCopyMap[DstReg].insert(RD).second) {
           MachineInstrBuilder VGPRCopy =
-              BuildMIAfter(*RD->getParent(), RD->getIterator(),
-                           RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                   .addDef(MappedReg, 0, 0)
-                  .addUse(DstReg, 0, 0);
+                  .addUse(DstReg, 0, 0)
+                  .insertAfter(RD);
           DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
           // If this reaching def was the last MI in the region, update the
@@ -2418,10 +2418,10 @@ bool RewriteScheduleStage::rewrite(
 
       // If this UseInst was the first MI in the region, update the region
       // boundaries.
-      if (LastMIToRegion.contains(UseInst)) {
+      if (FirstMIToRegion.contains(UseInst)) {
         unsigned UpdateRegion = FirstMIToRegion[UseInst];
         DAG.Regions[UpdateRegion].first = VGPRCopy;
-        LastMIToRegion.erase(UseInst);
+        FirstMIToRegion.erase(UseInst);
       }
 
       // Replace the operand for all users.
@@ -2469,6 +2469,8 @@ bool RewriteScheduleStage::rewrite(
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
     DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
 
+  DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
+
   return true;
 }
 

>From c65d1c425601278b99b72f79c5492bd325e7ac20 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 2 Dec 2025 10:03:06 -0600
Subject: [PATCH 13/35] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
 to AGPR

---
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 2082 ++++++++---------
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  114 +-
 2 files changed, 1098 insertions(+), 1098 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index 73eeafb6bccc5..f485b088c8034 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -215,42 +215,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -316,42 +316,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -418,46 +418,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -465,7 +461,11 @@ body:             |
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -538,55 +538,55 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -662,32 +662,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -695,16 +691,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -719,12 +715,16 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -805,32 +805,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -838,16 +834,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -861,14 +857,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -951,30 +951,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -982,14 +978,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -1002,16 +998,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1026,12 +1022,16 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1124,30 +1124,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -1155,14 +1151,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -1175,16 +1171,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1198,14 +1194,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1299,33 +1299,29 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1337,21 +1333,25 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1427,33 +1427,29 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1465,25 +1461,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1563,46 +1563,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1614,23 +1610,27 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1716,46 +1716,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1767,25 +1763,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1871,33 +1871,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
@@ -1905,22 +1900,22 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY2]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -1949,21 +1944,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2061,32 +2061,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
@@ -2094,16 +2090,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2126,25 +2122,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2238,30 +2238,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -2269,14 +2265,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -2289,16 +2285,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -2321,23 +2317,27 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.9
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2441,30 +2441,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -2472,14 +2468,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -2492,10 +2488,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -2507,25 +2503,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2622,41 +2622,36 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2671,9 +2666,14 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY7]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY7]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY7]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2752,41 +2752,37 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2801,9 +2797,13 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[COPY7]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[COPY7]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY7]], [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2882,51 +2882,51 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3000,55 +3000,55 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3122,52 +3122,52 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3240,53 +3240,53 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3360,45 +3360,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -3411,19 +3406,24 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3506,45 +3506,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -3557,20 +3552,25 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 384, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 384, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3656,33 +3656,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3694,21 +3689,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY2]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3783,33 +3783,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3821,25 +3816,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3920,46 +3920,41 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3971,23 +3966,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4074,46 +4074,41 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -4125,25 +4120,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4230,43 +4230,38 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4278,21 +4273,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY6]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY6]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4375,43 +4375,38 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4423,25 +4418,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4528,45 +4528,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -4579,10 +4574,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4594,23 +4589,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4705,45 +4705,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -4756,10 +4751,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4771,25 +4766,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4886,42 +4886,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4989,30 +4989,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -5020,14 +5016,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -5040,10 +5036,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -5055,16 +5051,16 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY5]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[DEF21:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -5072,7 +5068,11 @@ body:             |
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF21]]
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF19]], [[DEF20]], [[DEF21]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF18]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF17]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5168,32 +5168,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -5201,16 +5197,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -5224,14 +5220,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY11]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY11]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5314,46 +5314,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -5365,24 +5361,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY2]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5468,68 +5468,68 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   dead [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF21]]
-  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF22]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF17]]
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF20]], [[DEF21]], [[DEF22]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF19]], [[DEF13]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
index 050e4bc5e941c..2982c99c3fa7b 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -45,40 +45,33 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -90,7 +83,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF12]], [[DEF13]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -101,8 +97,8 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
@@ -114,19 +110,23 @@ body:             |
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   successors: %bb.10(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF18]].sub0, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub2, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF11]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub2, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.10:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF11]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[DEF18]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[DEF11]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -229,42 +229,35 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF11]]
+  ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -277,7 +270,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF14]], [[DEF15]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -289,22 +285,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF10]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:

>From 386538cd18236a14d140b9c79274a06401458c96 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 2 Dec 2025 12:59:08 -0600
Subject: [PATCH 14/35] Remove unnecessary second hasGFX90AInsts conditional
 check

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 07a8c3c34146c..b95bc8bf5b150 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -120,15 +120,11 @@ struct GCNRegPressure {
     unsigned AGPRSpill =
         AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
 
-    unsigned UnifiedSpill = 0;
-
-    if (ST.hasGFX90AInsts()) {
-      unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
-      unsigned UnifiedPressure = getVGPRNum(true);
-      UnifiedSpill = UnifiedPressure > CombinedThreshold
-                         ? (UnifiedPressure - CombinedThreshold)
-                         : 0;
-    }
+    unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+    unsigned UnifiedPressure = getVGPRNum(true);
+    unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold
+                                ? (UnifiedPressure - CombinedThreshold)
+                                : 0;
 
     return std::max(UnifiedSpill, (ArchSpill + AGPRSpill));
   }

>From 08729f9c78c9bfec688ce99f6dce382a2cafe76e Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Thu, 4 Dec 2025 10:38:47 -0600
Subject: [PATCH 15/35] Format fixes from review comments.

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  5 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 58 ++++++++++-----------
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index b95bc8bf5b150..f35c252d3bb05 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -107,7 +107,8 @@ struct GCNRegPressure {
     if (!ST.hasGFX90AInsts())
       return 0;
 
-    auto MaxVectorRegs = ST.getMaxNumVectorRegs(MF.getFunction());
+    std::pair<unsigned, unsigned> MaxVectorRegs =
+        ST.getMaxNumVectorRegs(MF.getFunction());
     unsigned ArchVGPRThreshold = MaxVectorRegs.first;
     unsigned AGPRThreshold = MaxVectorRegs.second;
 
@@ -121,7 +122,7 @@ struct GCNRegPressure {
         AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
 
     unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
-    unsigned UnifiedPressure = getVGPRNum(true);
+    unsigned UnifiedPressure = getVGPRNum(/*UnifiedVGPRFile=*/true);
     unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold
                                 ? (UnifiedPressure - CombinedThreshold)
                                 : 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d9894551050a8..962c6b9b16268 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1299,7 +1299,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
   RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
   RegionsWithExcessArchVGPR.reset();
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
-    auto PressureBefore = DAG.Pressure[Region];
+    GCNRegPressure PressureBefore = DAG.Pressure[Region];
     if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
       RegionsWithExcessArchVGPR[Region] = true;
   }
@@ -1985,42 +1985,42 @@ bool RewriteScheduleStage::initHeuristics(
         }
       }
 
-        MachineOperand &Dst = MI.getOperand(0);
-        SmallVector<MachineOperand *, 8> DstReachingUses;
+      MachineOperand &Dst = MI.getOperand(0);
+      SmallVector<MachineOperand *, 8> DstReachingUses;
 
-        findReachingUses(&MI, DAG.LIS, DstReachingUses);
+      findReachingUses(&MI, DAG.LIS, DstReachingUses);
 
-        for (MachineOperand *RUOp : DstReachingUses) {
-          if (TII->isMAI(*RUOp->getParent()))
-            continue;
+      for (MachineOperand *RUOp : DstReachingUses) {
+        if (TII->isMAI(*RUOp->getParent()))
+          continue;
 
-          // For any user of the result of the MFMA which is not an MFMA, we
-          // insert a copy. For a given register, we will only insert one copy
-          // per user block.
-          CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
+        // For any user of the result of the MFMA which is not an MFMA, we
+        // insert a copy. For a given register, we will only insert one copy
+        // per user block.
+        CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
 
-          SmallVector<SlotIndex, 8> DstUsesReachingDefs;
-          findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
 
-          for (auto RDIndex : DstUsesReachingDefs) {
-            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
-            if (TII->isMAI(*RD))
-              continue;
+        for (auto RDIndex : DstUsesReachingDefs) {
+          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+          if (TII->isMAI(*RD))
+            continue;
 
-            // For any definition of the user of the MFMA which is not an MFMA,
-            // we insert a copy. We do this to transform all the reaching defs
-            // of this use to AGPR. By doing this, we can insert a copy from
-            // AGPR to VGPR at the user rather than after the MFMA.
-            CopyForDef.insert(RD);
-          }
+          // For any definition of the user of the MFMA which is not an MFMA,
+          // we insert a copy. We do this to transform all the reaching defs
+          // of this use to AGPR. By doing this, we can insert a copy from
+          // AGPR to VGPR at the user rather than after the MFMA.
+          CopyForDef.insert(RD);
         }
+      }
 
-        // Do the rewrite to allow for updated RP calculation.
-        const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
-        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
-        DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
-        if (Src2->isReg())
-          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+      // Do the rewrite to allow for updated RP calculation.
+      const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+      const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+      DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
+      if (Src2->isReg())
+        DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
     }
   }
 

>From 53b396949e19066314186937abcf460b4d19f9f8 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Thu, 4 Dec 2025 11:51:47 -0600
Subject: [PATCH 16/35] Use VNInfo api to check for PHI def rather than doing
 it manually

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 962c6b9b16268..8ef915039a215 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1231,10 +1231,8 @@ void RewriteScheduleStage::findReachingDefs(
   LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
   VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
 
-  SlotIndex DefMBBStart = LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNI->def));
-
-  // If the def is in the block, then it must be the only reaching def.
-  if (DefMBBStart != VNI->def) {
+  // If the def is not a PHI, then it must be the only reaching def.
+  if (!VNI->isPHIDef()) {
     DefIdxs.push_back(VNI->def);
     return;
   }
@@ -1257,11 +1255,10 @@ void RewriteScheduleStage::findReachingDefs(
     VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
 
     MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
-    SlotIndex DefMBBStart = LIS->getMBBStartIdx(DefMBB);
 
     // If there is a def in this block, then add it to the list. This is the
     // reaching def of this path.
-    if (DefMBBStart != VNI->def) {
+    if (!VNI->isPHIDef()) {
       DefIdxs.push_back(VNI->def);
       continue;
     }

>From 89e1fce428011ab3c62a47300ab23da30e5ae67d Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 5 Dec 2025 11:36:43 -0600
Subject: [PATCH 17/35] Review comments

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 49 ++++++++++++---------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8ef915039a215..b5d58eac9fe48 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1999,7 +1999,7 @@ bool RewriteScheduleStage::initHeuristics(
         SmallVector<SlotIndex, 8> DstUsesReachingDefs;
         findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
 
-        for (auto RDIndex : DstUsesReachingDefs) {
+        for (SlotIndex RDIndex : DstUsesReachingDefs) {
           MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
           if (TII->isMAI(*RD))
             continue;
@@ -2106,7 +2106,7 @@ int64_t RewriteScheduleStage::getRewriteCost(
     uint64_t UseFreq =
         EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
-    for (auto UseReg : UseRegs) {
+    for (Register UseReg : UseRegs) {
       unsigned RegSize =
           DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
       unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
@@ -2142,7 +2142,7 @@ bool RewriteScheduleStage::rewrite(
   DenseMap<MachineInstr *, unsigned> LastMIToRegion;
 
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
-    auto Entry = DAG.Regions[Region];
+   RegionBoundaries Entry = DAG.Regions[Region];
     if (Entry.first == Entry.second)
       continue;
 
@@ -2190,7 +2190,7 @@ bool RewriteScheduleStage::rewrite(
   // up creating illegal instructions.
 
   // The original registers of the MFMA that need to be reclassified as AGPR.
-  std::set<Register> RewriteRegs;
+  DenseSet<Register> RewriteRegs;
   // The map of an original register in the MFMA to a new register (result of a
   // copy) that it should be replaced with.
   DenseMap<Register, Register> RedefMap;
@@ -2204,7 +2204,6 @@ bool RewriteScheduleStage::rewrite(
       ReachingUseTracker;
 
   for (auto &[MI, OriginalOpcode] : RewriteCands) {
-
     int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
     if (ReplacementOp == -1)
       continue;
@@ -2212,7 +2211,6 @@ bool RewriteScheduleStage::rewrite(
 
     // Case 1: insert copies for the reaching defs of the Src2Reg.
     MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
-
     if (Src2->isReg()) {
       Register Src2Reg = Src2->getReg();
       if (!Src2Reg.isVirtual())
@@ -2223,7 +2221,7 @@ bool RewriteScheduleStage::rewrite(
       findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
       SmallVector<MachineInstr *, 8> Src2DefsReplace;
 
-      for (auto RDIndex : Src2ReachingDefs) {
+      for (SlotIndex RDIndex : Src2ReachingDefs) {
         MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
         if (TII->isMAI(*RD))
           continue;
@@ -2234,8 +2232,9 @@ bool RewriteScheduleStage::rewrite(
       }
 
       if (!Src2DefsReplace.empty()) {
-        if (RedefMap.contains(Src2Reg)) {
-          MappedReg = RedefMap[Src2Reg];
+        DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
+        if (RI != RedefMap.end()) {
+          MappedReg = RI->second;
         } else {
           assert(!ReachingDefCopyMap.contains(Src2Reg));
           const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
@@ -2304,7 +2303,7 @@ bool RewriteScheduleStage::rewrite(
       SmallVector<SlotIndex, 8> DstUsesReachingDefs;
       findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
 
-      for (auto RDIndex : DstUsesReachingDefs) {
+      for (SlotIndex RDIndex : DstUsesReachingDefs) {
         MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
         if (TII->isMAI(*RD))
           continue;
@@ -2317,9 +2316,10 @@ bool RewriteScheduleStage::rewrite(
     }
 
     if (!DstUseDefsReplace.empty()) {
-      if (RedefMap.contains(DstReg))
-        MappedReg = RedefMap[DstReg];
-      else {
+      DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
+      if (RI != RedefMap.end()) {
+        MappedReg = RI->second;
+      } else {
         assert(!ReachingDefCopyMap.contains(DstReg));
         const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
         const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
@@ -2343,8 +2343,10 @@ bool RewriteScheduleStage::rewrite(
 
           // If this reaching def was the last MI in the region, update the
           // region boundaries.
-          if (LastMIToRegion.contains(RD)) {
-            unsigned UpdateRegion = LastMIToRegion[RD];
+          DenseMap<MachineInstr *, unsigned>::iterator LMI =
+              LastMIToRegion.find(RD);
+          if (LMI != LastMIToRegion.end()) {
+            unsigned UpdateRegion = LMI->second;
             DAG.Regions[UpdateRegion].second = VGPRCopy;
             LastMIToRegion.erase(RD);
           }
@@ -2389,13 +2391,16 @@ bool RewriteScheduleStage::rewrite(
   }
 
   // Handle the copies for dst uses.
-  for (auto RUBlockEntry : ReachingUseTracker) {
-    for (auto RUDst : RUBlockEntry.second) {
+  using RUBType =
+      std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
+  for (RUBType RUBlockEntry : ReachingUseTracker) {
+    using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
+    for (RUDType RUDst : RUBlockEntry.second) {
       MachineOperand *OpBegin = *RUDst.second.begin();
       SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
 
       // Find the earliest use in this block.
-      for (auto *User : RUDst.second) {
+      for (MachineOperand *User : RUDst.second) {
         SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
         if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
           InstPt = NewInstPt;
@@ -2415,8 +2420,10 @@ bool RewriteScheduleStage::rewrite(
 
       // If this UseInst was the first MI in the region, update the region
       // boundaries.
-      if (FirstMIToRegion.contains(UseInst)) {
-        unsigned UpdateRegion = FirstMIToRegion[UseInst];
+      DenseMap<MachineInstr *, unsigned>::iterator FI =
+          FirstMIToRegion.find(UseInst);
+      if (FI != FirstMIToRegion.end()) {
+        unsigned UpdateRegion = FI->second;
         DAG.Regions[UpdateRegion].first = VGPRCopy;
         FirstMIToRegion.erase(UseInst);
       }
@@ -2434,7 +2441,7 @@ bool RewriteScheduleStage::rewrite(
   // We may have needed to insert copies after the reaching defs of the MFMAs.
   // Replace the original register with the result of the copy for all relevant
   // operands.
-  for (auto NewDef : RedefMap) {
+  for (std::pair<Register, Register> NewDef : RedefMap) {
     Register OldReg = NewDef.first;
     Register NewReg = NewDef.second;
 

>From f37c7b5fce36eea3cd340ab98ac64b351797b753 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 8 Dec 2025 10:26:44 -0600
Subject: [PATCH 18/35] Review comments

---
 .../llvm/CodeGen/MachineInstrBuilder.h        |    9 -
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |   25 +-
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 1232 ++++++-----------
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |   18 +-
 4 files changed, 455 insertions(+), 829 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index af5d91d09a608..eb9bcfb7c01a3 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -432,15 +432,6 @@ class MachineInstrBuilder {
     return *this;
   }
 
-  /// Inserts the newly-built instruction after the given position in the
-  /// given MachineBasicBlock.
-  const MachineInstrBuilder &insertAfter(MachineInstr *MInstr) const {
-    MachineBasicBlock *MBB = MInstr->getParent();
-    MachineBasicBlock::iterator I = MInstr->getIterator();
-    MBB->insertAfter(I, MI);
-    return *this;
-  }
-
   bool constrainAllUses(const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI,
                         const RegisterBankInfo &RBI) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b5d58eac9fe48..369450e7538e6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2096,9 +2096,8 @@ int64_t RewriteScheduleStage::getRewriteCost(
             ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
             : 1;
 
-    unsigned RegSize = DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(DefReg));
-    unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
-    CopyCost += NumRegs * DefFreq;
+    const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
+    CopyCost += RC->getCopyCost() * DefFreq;
   }
 
   // Account for CopyForUse copies in each block that the register is used.
@@ -2107,10 +2106,8 @@ int64_t RewriteScheduleStage::getRewriteCost(
         EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
     for (Register UseReg : UseRegs) {
-      unsigned RegSize =
-          DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
-      unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
-      CopyCost += NumRegs * UseFreq;
+      const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
+      CopyCost += RC->getCopyCost() * UseFreq;
     }
   }
 
@@ -2118,7 +2115,7 @@ int64_t RewriteScheduleStage::getRewriteCost(
 
   // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
   // defs of the register may require VGPR.
-  for (auto &[MI, OriginalOpcode] : RewriteCands) {
+  for (auto [MI, OriginalOpcode] : RewriteCands) {
     assert(TII->isMAI(*MI));
     const TargetRegisterClass *AGPRRC =
         DAG.MRI.getRegClass(MI->getOperand(0).getReg());
@@ -2252,10 +2249,10 @@ bool RewriteScheduleStage::rewrite(
           // Do not create redundant copies.
           if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
             MachineInstrBuilder VGPRCopy =
-                BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                BuildMI(*RD->getParent(), std::next(RD->getIterator()),
+                        RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                     .addDef(MappedReg, 0, 0)
-                    .addUse(Src2Reg, 0, 0)
-                    .insertAfter(RD);
+                    .addUse(Src2Reg, 0, 0);
             DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
             // If this reaching def was the last MI in the region, update the
@@ -2335,10 +2332,10 @@ bool RewriteScheduleStage::rewrite(
         // Do not create reundant copies.
         if (ReachingDefCopyMap[DstReg].insert(RD).second) {
           MachineInstrBuilder VGPRCopy =
-              BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              BuildMI(*RD->getParent(), std::next(RD->getIterator()),
+                      RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                   .addDef(MappedReg, 0, 0)
-                  .addUse(DstReg, 0, 0)
-                  .insertAfter(RD);
+                  .addUse(DstReg, 0, 0);
           DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
           // If this reaching def was the last MI in the region, update the
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index f485b088c8034..44e5563ce7adb 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -228,29 +228,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -330,28 +325,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -438,34 +428,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -558,35 +542,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -661,70 +639,55 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -804,71 +767,56 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF12]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -950,43 +898,36 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -998,40 +939,31 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1123,43 +1055,36 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -1171,41 +1096,32 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF12]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1312,16 +1228,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1332,26 +1247,20 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1441,15 +1350,14 @@ body:             |
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1460,30 +1368,24 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1583,22 +1485,20 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1609,28 +1509,22 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1736,22 +1630,20 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1762,30 +1654,24 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1870,69 +1756,51 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF10]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY2]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF10]], implicit $exec
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF12]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF13]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6]]
-  ; CHECK-NEXT:   KILL [[COPY10]], [[COPY5]], [[COPY12]], [[COPY7]], [[COPY14]], [[COPY9]], [[COPY16]], [[COPY11]], [[COPY6]], [[COPY13]], [[COPY8]], [[COPY15]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
@@ -1943,27 +1811,22 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF10]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF10]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF16]], [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF15]], [[DEF11]], [[DEF12]], [[DEF13]], [[DEF14]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2060,57 +1923,45 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
@@ -2121,30 +1972,25 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF12]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2237,43 +2083,36 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2285,27 +2124,21 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.7(0x40000000)
@@ -2316,28 +2149,23 @@ body:             |
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.9
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2455,13 +2283,9 @@ body:             |
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -2469,14 +2293,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2488,10 +2310,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -2502,30 +2324,24 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DEF16]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2901,32 +2717,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3019,36 +2832,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3137,37 +2944,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3255,38 +3053,29 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3374,12 +3163,8 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -3387,14 +3172,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3406,24 +3189,20 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3520,12 +3299,8 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -3533,14 +3308,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3552,25 +3325,21 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 384, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 384, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3666,7 +3435,6 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3674,10 +3442,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3688,27 +3456,21 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY2]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3793,7 +3555,6 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3801,10 +3562,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3815,31 +3576,25 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3939,22 +3694,20 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3965,29 +3718,23 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4093,22 +3840,20 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -4119,31 +3864,25 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4245,23 +3984,18 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4272,27 +4006,21 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY6]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4390,23 +4118,18 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4417,31 +4140,25 @@ body:             |
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4542,12 +4259,8 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -4555,14 +4268,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4574,10 +4285,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4588,29 +4299,23 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4719,12 +4424,8 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -4732,14 +4433,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4751,10 +4450,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4765,31 +4464,25 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[DEF13]], [[DEF14]], [[DEF15]], [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4899,29 +4592,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5003,13 +4691,9 @@ body:             |
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
@@ -5017,14 +4701,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -5036,10 +4718,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -5050,29 +4732,23 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF19]], [[DEF20]], [[DEF21]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF18]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF17]]
+  ; CHECK-NEXT:   KILL [[DEF19]], [[DEF20]], [[DEF21]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF18]], [[DEF13]], [[DEF14]], [[DEF15]], [[DEF16]], [[V_ADD_U32_e32_]], [[DEF17]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5167,71 +4843,56 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
-  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY11]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF12]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF11]], [[V_ADD_U32_e32_1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5334,22 +4995,20 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -5360,29 +5019,23 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5481,8 +5134,6 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   dead [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -5491,45 +5142,38 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF17]]
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF15]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF20]], [[DEF21]], [[DEF22]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF19]], [[DEF13]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[DEF15]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
index 2982c99c3fa7b..ab5a5cfd345a4 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -240,7 +240,6 @@ body:             |
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF11]]
   ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
@@ -251,14 +250,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -273,7 +270,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF14]], [[DEF15]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -284,27 +281,24 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF9]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF9]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF10]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[DEF11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:

>From 28284fd3804019009145ef885cd38b15b77cfcaa Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 9 Dec 2025 11:56:37 -0600
Subject: [PATCH 19/35] Review comments

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 ++++++++++-----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  8 +++----
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 369450e7538e6..07b4b1a533af2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -691,7 +691,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
-  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -948,8 +948,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
-  case GCNSchedStageID::RewriteSchedule:
-    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteMFMAForm:
+    return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -1187,7 +1187,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
-  case GCNSchedStageID::RewriteSchedule:
+  case GCNSchedStageID::RewriteMFMAForm:
     OS << "Instruction Rewriting Reschedule";
     break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
@@ -1223,10 +1223,9 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
-void RewriteScheduleStage::findReachingDefs(
+void RewriteMFMAFormStage::findReachingDefs(
     MachineOperand &UseMO, LiveIntervals *LIS,
     SmallVectorImpl<SlotIndex> &DefIdxs) {
-  assert(UseMO.isReg());
   MachineInstr *UseMI = UseMO.getParent();
   LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
   VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
@@ -1239,7 +1238,6 @@ void RewriteScheduleStage::findReachingDefs(
 
   SmallPtrSet<MachineBasicBlock *, 8> Visited;
   SmallVector<MachineBasicBlock *, 8> Worklist;
-
   Visited.insert(UseMI->getParent());
 
   // Mark the predecessor blocks for traversal
@@ -1270,11 +1268,11 @@ void RewriteScheduleStage::findReachingDefs(
   }
 }
 
-void RewriteScheduleStage::findReachingUses(
+void RewriteMFMAFormStage::findReachingUses(
     MachineInstr *DefMI, LiveIntervals *LIS,
     SmallVectorImpl<MachineOperand *> &ReachingUses) {
   SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
-  for (auto &UseMO :
+  for (MachineOperand &UseMO :
        DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
     SmallVector<SlotIndex, 8> ReachingDefIndexes;
     findReachingDefs(UseMO, LIS, ReachingDefIndexes);
@@ -1288,7 +1286,7 @@ void RewriteScheduleStage::findReachingUses(
   }
 }
 
-bool RewriteScheduleStage::initGCNSchedStage() {
+bool RewriteMFMAFormStage::initGCNSchedStage() {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
     return false;
@@ -1945,14 +1943,14 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
 
   if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
     return false;
   return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
 }
 
-bool RewriteScheduleStage::initHeuristics(
+bool RewriteMFMAFormStage::initHeuristics(
     std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
@@ -2024,7 +2022,7 @@ bool RewriteScheduleStage::initHeuristics(
   return true;
 }
 
-int64_t RewriteScheduleStage::getRewriteCost(
+int64_t RewriteMFMAFormStage::getRewriteCost(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
@@ -2133,7 +2131,7 @@ int64_t RewriteScheduleStage::getRewriteCost(
   return Cost;
 }
 
-bool RewriteScheduleStage::rewrite(
+bool RewriteMFMAFormStage::rewrite(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
   DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
   DenseMap<MachineInstr *, unsigned> LastMIToRegion;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 76dba27d6f83f..0a79da061ab8e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -31,7 +31,7 @@ class GCNSchedStage;
 
 enum class GCNSchedStageID : unsigned {
   OccInitialSchedule = 0,
-  RewriteSchedule = 1,
+  RewriteMFMAForm = 1,
   UnclusteredHighRPReschedule = 2,
   ClusteredLowOccupancyReschedule = 3,
   PreRARematerialize = 4,
@@ -243,7 +243,7 @@ using RegionBoundaries =
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
-  friend class RewriteScheduleStage;
+  friend class RewriteMFMAFormStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
@@ -418,7 +418,7 @@ class OccInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
-class RewriteScheduleStage : public GCNSchedStage {
+class RewriteMFMAFormStage : public GCNSchedStage {
 private:
   // Record regions with excess archvgpr register pressure over the physical
   // register limit. Register pressure in these regions usually will result in
@@ -467,7 +467,7 @@ class RewriteScheduleStage : public GCNSchedStage {
 public:
   bool initGCNSchedStage() override;
 
-  RewriteScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+  RewriteMFMAFormStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
 };
 

>From e5698b559bcc43aed9095ddeb7a3b015246128b1 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 9 Dec 2025 11:58:20 -0600
Subject: [PATCH 20/35] Review comments: add MachineBlockFrequencyAnalysis as
 preserved to PHI elimination, unreachable block elimination and SI lower
 control flow (AMDGPU) passes.

---
 llvm/lib/CodeGen/UnreachableBlockElim.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index db6532c9d533a..1f767fab77a4c 100644
--- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -131,7 +131,11 @@ UnreachableMachineBlockElimPass::run(MachineFunction &MF,
   return getMachineFunctionPassPreservedAnalyses()
       .preserve<MachineLoopAnalysis>()
       .preserve<MachineDominatorTreeAnalysis>()
+<<<<<<< HEAD
       .preserve<MachinePostDominatorTreeAnalysis>()
+=======
+      .preserve<MachinePostDominatorTreeAnalysis>();
+>>>>>>> 4c54902ed730 (Review comments: add MachineBlockFrequencyAnalysis as preserved to PHI elimination,)
       .preserve<MachineBlockFrequencyAnalysis>();
 }
 

>From 192bd088f658c4cce4ce4b6fa942f21904a91d95 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 9 Dec 2025 14:24:54 -0600
Subject: [PATCH 21/35] Review comments: only reset instruction descriptor when
 the rewrite is actually going to be performed.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  36 +-
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 688 +++++++++---------
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  16 +-
 3 files changed, 358 insertions(+), 382 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 07b4b1a533af2..fcf9f64180330 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1955,8 +1955,8 @@ bool RewriteMFMAFormStage::initHeuristics(
     DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
   // Prepare for the heuristics
-  for (auto &MBB : MF) {
-    for (auto &MI : MBB) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
       if (!isRewriteCandidate(&MI))
         continue;
 
@@ -1964,7 +1964,6 @@ bool RewriteMFMAFormStage::initHeuristics(
       assert(ReplacementOp != -1);
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
-      MI.setDesc(TII->get(ReplacementOp));
 
       MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
       if (Src2->isReg()) {
@@ -1975,7 +1974,7 @@ bool RewriteMFMAFormStage::initHeuristics(
         // insert a copy.
         for (SlotIndex RDIdx : Src2ReachingDefs) {
           MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
-          if (!TII->isMAI(*RD))
+          if (!isRewriteCandidate(RD))
             CopyForDef.insert(RD);
         }
       }
@@ -2026,14 +2025,10 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
-  MachineBranchProbabilityInfo MBPI;
-  MachineBlockFrequencyInfo MBFI;
-
-  MBFI.calculate(MF, MBPI, *DAG.MLI);
+  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
   int64_t BestSpillCost = 0;
   int64_t Cost = 0;
-
-  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
 
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
     if (!RegionsWithExcessArchVGPR[Region])
@@ -2109,26 +2104,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     }
   }
 
-  Cost += CopyCost;
-
-  // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
-  // defs of the register may require VGPR.
-  for (auto [MI, OriginalOpcode] : RewriteCands) {
-    assert(TII->isMAI(*MI));
-    const TargetRegisterClass *AGPRRC =
-        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
-    const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
-
-    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
-    assert(Src2);
-
-    if (Src2->isReg())
-      DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
-    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
-    MI->setDesc(TII->get(OriginalOpcode));
-  }
-
-  return Cost;
+  return Cost + CopyCost;
 }
 
 bool RewriteMFMAFormStage::rewrite(
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index 44e5563ce7adb..56a307d2afb56 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -228,15 +228,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
@@ -324,15 +324,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
@@ -427,21 +427,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -541,21 +541,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
@@ -652,27 +652,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -680,14 +680,14 @@ body:             |
   ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -780,27 +780,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -911,23 +911,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -939,16 +939,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1068,23 +1068,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -1096,16 +1096,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1228,15 +1228,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1349,15 +1349,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1484,21 +1484,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1629,21 +1629,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1756,46 +1756,47 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF10]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF10]], implicit $exec
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF12]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF13]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -1811,22 +1812,21 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF10]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF10]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF16]], [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF15]], [[DEF11]], [[DEF12]], [[DEF13]], [[DEF14]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1936,27 +1936,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2096,23 +2096,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2124,16 +2124,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -2282,23 +2282,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2310,10 +2310,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -2447,7 +2447,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
@@ -2577,7 +2577,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
@@ -2716,21 +2716,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -2831,21 +2831,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
@@ -2938,24 +2938,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 0, 0, implicit $exec
@@ -3047,24 +3047,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
@@ -3158,26 +3158,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3189,10 +3189,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 128, 0, implicit $exec
@@ -3294,26 +3294,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3325,10 +3325,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
@@ -3434,7 +3434,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3442,10 +3442,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3554,7 +3554,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3562,10 +3562,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3693,21 +3693,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3839,21 +3839,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3978,24 +3978,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4112,24 +4112,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4254,26 +4254,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4285,10 +4285,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4419,26 +4419,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4450,10 +4450,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4592,15 +4592,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
@@ -4690,23 +4690,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4718,10 +4718,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4856,27 +4856,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -4994,21 +4994,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -5134,6 +5134,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   dead [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -5141,30 +5142,29 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF15]].sub0, [[DEF12]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
index ab5a5cfd345a4..40f87e838d314 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -55,7 +55,7 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
@@ -65,13 +65,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -86,7 +86,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -239,7 +239,7 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
@@ -249,13 +249,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -270,7 +270,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)

>From 142362fddc504806ddc66b538ff516f86a56b17a Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 9 Dec 2025 16:44:35 -0600
Subject: [PATCH 22/35] Review comments: remove map lookup inside loop

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fcf9f64180330..b02f7817f61f7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2325,6 +2325,7 @@ bool RewriteMFMAFormStage::rewrite(
       }
     }
 
+    std::set<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
     for (MachineOperand *RU : DstReachingUseCopies) {
       MachineBasicBlock *RUBlock = RU->getParent()->getParent();
       // Just keep track of the reaching use of this register by block. After we
@@ -2350,15 +2351,16 @@ bool RewriteMFMAFormStage::rewrite(
       // use reg.
       RU->setReg(NewUseReg);
       // Track the copy source operand for replacement.
-      ReplaceMap[DstReg].insert(&VGPRCopy->getOperand(1));
+      DstRegSet.insert(&VGPRCopy->getOperand(1));
     }
 
     // Track the register for reclassification
     RewriteRegs.insert(DstReg);
+
     // Insert the dst operand for replacement. If this dst is in a chain of
     // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
     // all the correspond operands need to be replaced.
-    ReplaceMap[DstReg].insert(Dst);
+    DstRegSet.insert(Dst);
   }
 
   // Handle the copies for dst uses.

>From febbd6ece868774dfff664d446f3e3a58f95e6b8 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 16 Dec 2025 09:35:51 -0600
Subject: [PATCH 23/35] Remove calculation of MBFI.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b02f7817f61f7..55a7326118038 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2045,7 +2045,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     unsigned SpillCostAfter = PressureAfter.getVGPRSpills(MF);
 
     uint64_t BlockFreq =
-        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
 
     bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
@@ -2086,7 +2086,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     auto DefReg = DefMI->getOperand(0).getReg();
     uint64_t DefFreq =
         EntryFreq
-            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
             : 1;
 
     const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
@@ -2096,7 +2096,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   // Account for CopyForUse copies in each block that the register is used.
   for (auto &[UseBlock, UseRegs] : CopyForUse) {
     uint64_t UseFreq =
-        EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
+        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
     for (Register UseReg : UseRegs) {
       const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);

>From e6919ee639a80336b2fb91b48c955c1ea5b21149 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 17 Dec 2025 10:07:32 -0600
Subject: [PATCH 24/35] Add flag to disable mfma form rewrite stage in
 scheduler

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 23 +++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 55a7326118038..e0c16543e1821 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -91,6 +91,11 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
     cl::init(false));
 #endif
 
+static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
+    "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
+    cl::desc("Disable rewrie mfma rewrite scheduling stage."),
+    cl::init(false));
+
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -1287,6 +1292,9 @@ void RewriteMFMAFormStage::findReachingUses(
 }
 
 bool RewriteMFMAFormStage::initGCNSchedStage() {
+  if (DisableRewriteMFMAFormSchedStage)
+    return false;
+
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
     return false;
@@ -2025,10 +2033,13 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
-  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
+
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
   int64_t BestSpillCost = 0;
   int64_t Cost = 0;
-  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
 
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
     if (!RegionsWithExcessArchVGPR[Region])
@@ -2045,7 +2056,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     unsigned SpillCostAfter = PressureAfter.getVGPRSpills(MF);
 
     uint64_t BlockFreq =
-        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
+        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
 
     bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
@@ -2086,7 +2097,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     auto DefReg = DefMI->getOperand(0).getReg();
     uint64_t DefFreq =
         EntryFreq
-            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
             : 1;
 
     const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
@@ -2096,7 +2107,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   // Account for CopyForUse copies in each block that the register is used.
   for (auto &[UseBlock, UseRegs] : CopyForUse) {
     uint64_t UseFreq =
-        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
+        EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
     for (Register UseReg : UseRegs) {
       const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
@@ -2350,7 +2361,7 @@ bool RewriteMFMAFormStage::rewrite(
       // Since we know this use has only one reaching def, we can replace the
       // use reg.
       RU->setReg(NewUseReg);
-      // Track the copy source operand for replacement.
+      // Track the copy source operand for replacement.
       DstRegSet.insert(&VGPRCopy->getOperand(1));
     }
 

>From 72932cc44d328ae4cda49dcb3b257d4ecd3e9a16 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 12 Jan 2026 10:21:08 -0600
Subject: [PATCH 25/35] Try to fix merge conflict

---
 llvm/lib/CodeGen/UnreachableBlockElim.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index 1f767fab77a4c..db6532c9d533a 100644
--- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -131,11 +131,7 @@ UnreachableMachineBlockElimPass::run(MachineFunction &MF,
   return getMachineFunctionPassPreservedAnalyses()
       .preserve<MachineLoopAnalysis>()
       .preserve<MachineDominatorTreeAnalysis>()
-<<<<<<< HEAD
       .preserve<MachinePostDominatorTreeAnalysis>()
-=======
-      .preserve<MachinePostDominatorTreeAnalysis>();
->>>>>>> 4c54902ed730 (Review comments: add MachineBlockFrequencyAnalysis as preserved to PHI elimination,)
       .preserve<MachineBlockFrequencyAnalysis>();
 }
 

>From bd4804d103f2dd29c5a8a4a46fc143d70c3ef86e Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 14 Jan 2026 10:55:01 -0600
Subject: [PATCH 26/35] Review comments.

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  9 ++-----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 30 +++++++++++++--------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index f35c252d3bb05..3e8a33f6af235 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -102,16 +102,12 @@ struct GCNRegPressure {
                                                 DynamicVGPRBlockSize));
   }
 
-  unsigned getVGPRSpills(MachineFunction &MF) {
+  unsigned getVGPRSpills(MachineFunction &MF, unsigned ArchVGPRThreshold,
+                         unsigned AGPRThreshold, unsigned CombinedThreshold) {
     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (!ST.hasGFX90AInsts())
       return 0;
 
-    std::pair<unsigned, unsigned> MaxVectorRegs =
-        ST.getMaxNumVectorRegs(MF.getFunction());
-    unsigned ArchVGPRThreshold = MaxVectorRegs.first;
-    unsigned AGPRThreshold = MaxVectorRegs.second;
-
     unsigned ArchPressure = getArchVGPRNum();
     unsigned AGPRPressure = getAGPRNum();
 
@@ -121,7 +117,6 @@ struct GCNRegPressure {
     unsigned AGPRSpill =
         AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
 
-    unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
     unsigned UnifiedPressure = getVGPRNum(/*UnifiedVGPRFile=*/true);
     unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold
                                 ? (UnifiedPressure - CombinedThreshold)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e0c16543e1821..e936a5b53a686 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -93,7 +93,7 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
 
 static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
     "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
-    cl::desc("Disable rewrie mfma rewrite scheduling stage."),
+    cl::desc("Disable rewrie mfma rewrite scheduling stage"),
     cl::init(false));
 
 const unsigned ScheduleMetrics::ScaleFactor = 100;
@@ -696,7 +696,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
-  SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm);
+  if (!DisableRewriteMFMAFormSchedStage)
+    SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -1241,12 +1242,11 @@ void RewriteMFMAFormStage::findReachingDefs(
     return;
   }
 
-  SmallPtrSet<MachineBasicBlock *, 8> Visited;
+  SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
   SmallVector<MachineBasicBlock *, 8> Worklist;
-  Visited.insert(UseMI->getParent());
 
   // Mark the predecessor blocks for traversal
-  for (auto *PredMBB : UseMI->getParent()->predecessors()) {
+  for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
     Worklist.push_back(PredMBB);
     Visited.insert(PredMBB);
   }
@@ -1266,7 +1266,7 @@ void RewriteMFMAFormStage::findReachingDefs(
       continue;
     }
 
-    for (auto *PredMBB : DefMBB->predecessors()) {
+    for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
       if (Visited.insert(PredMBB).second)
         Worklist.push_back(PredMBB);
     }
@@ -1292,9 +1292,9 @@ void RewriteMFMAFormStage::findReachingUses(
 }
 
 bool RewriteMFMAFormStage::initGCNSchedStage() {
-  if (DisableRewriteMFMAFormSchedStage)
-    return false;
-
+  // We only need to run this pass if the architecture supports AGPRs.
+  // Additionally, we don't use AGPRs at occupancy levels above 1 so there
+  // is no need for this pass in that case, either.
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
     return false;
@@ -2041,19 +2041,27 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   int64_t Cost = 0;
   uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
 
+  std::pair<unsigned, unsigned> MaxVectorRegs =
+      ST.getMaxNumVectorRegs(MF.getFunction());
+  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
+  unsigned AGPRThreshold = MaxVectorRegs.second;
+  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
     if (!RegionsWithExcessArchVGPR[Region])
       continue;
 
     GCNRegPressure &PressureBefore = DAG.Pressure[Region];
-    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(MF);
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
+        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
 
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
     // beneficial.
     GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
-    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(MF);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
+        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
 
     uint64_t BlockFreq =
         MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())

>From 77bba1242374dbe873ac885e1a2b9a0d1d03c51d Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 16 Jan 2026 09:33:48 -0600
Subject: [PATCH 27/35] Fix weird merge thing

---
 llvm/lib/CodeGen/MachineScheduler.cpp | 6 ++----
 llvm/lib/CodeGen/PHIElimination.cpp   | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 9c369a14696a1..b44d96609c170 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -420,8 +420,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass);
 INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE,
                     "Machine Instruction Scheduler", false, false)
 
-MachineSchedulerLegacy::MachineSchedulerLegacy()
-    : MachineFunctionPass(ID) {
+MachineSchedulerLegacy::MachineSchedulerLegacy() : MachineFunctionPass(ID) {
   initializeMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry());
 }
 
@@ -709,8 +708,7 @@ MachineSchedulerPass::run(MachineFunction &MF,
   return getMachineFunctionPassPreservedAnalyses()
       .preserveSet<CFGAnalyses>()
       .preserve<SlotIndexesAnalysis>()
-      .preserve<LiveIntervalsAnalysis>()
-      .preserve<MachineBlockFrequencyAnalysis>();
+      .preserve<LiveIntervalsAnalysis>();
 }
 
 bool PostMachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index e6a5183cf528c..1c054cca72cde 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -129,7 +129,7 @@ class PHIEliminationImpl {
     auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
     auto *MDTWrapper =
         P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
-    auto *PDTWrapper = 
+    auto *PDTWrapper =
         P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
     auto *MBPIWrapper =
         P->getAnalysisIfAvailable<MachineBranchProbabilityInfoWrapperPass>();

>From fe41dab0d5dc12378177d21b4784bd22fee51ccd Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 16 Jan 2026 11:02:34 -0600
Subject: [PATCH 28/35] Review comments

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  2 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 21 ++++++++++-----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3e8a33f6af235..c55796c37f287 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -122,7 +122,7 @@ struct GCNRegPressure {
                                 ? (UnifiedPressure - CombinedThreshold)
                                 : 0;
 
-    return std::max(UnifiedSpill, (ArchSpill + AGPRSpill));
+    return std::max(UnifiedSpill, ArchSpill + AGPRSpill);
   }
 
   void inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e936a5b53a686..2fcd30564578f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2033,13 +2033,11 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
-  MachineBranchProbabilityInfo MBPI;
-  MachineBlockFrequencyInfo MBFI;
+  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
 
-  MBFI.calculate(MF, MBPI, *DAG.MLI);
   int64_t BestSpillCost = 0;
   int64_t Cost = 0;
-  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
 
   std::pair<unsigned, unsigned> MaxVectorRegs =
       ST.getMaxNumVectorRegs(MF.getFunction());
@@ -2064,7 +2062,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
 
     uint64_t BlockFreq =
-        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
 
     bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
@@ -2105,7 +2103,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     auto DefReg = DefMI->getOperand(0).getReg();
     uint64_t DefFreq =
         EntryFreq
-            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
             : 1;
 
     const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
@@ -2115,7 +2113,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   // Account for CopyForUse copies in each block that the register is used.
   for (auto &[UseBlock, UseRegs] : CopyForUse) {
     uint64_t UseFreq =
-        EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
+        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
     for (Register UseReg : UseRegs) {
       const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
@@ -2421,7 +2419,7 @@ bool RewriteMFMAFormStage::rewrite(
       }
 
       // Replace the operand for all users.
-      for (auto *User : RUDst.second) {
+      for (MachineOperand *User : RUDst.second) {
         User->setReg(NewUseReg);
       }
 
@@ -2443,12 +2441,13 @@ bool RewriteMFMAFormStage::rewrite(
   }
 
   // Finally, do the reclassification of the MFMA registers.
-  for (auto RewriteReg : RewriteRegs) {
+  for (Register RewriteReg : RewriteRegs) {
     Register RegToRewrite = RewriteReg;
 
     // Be sure to update the replacement register and not the original.
-    if (RedefMap.contains(RewriteReg))
-      RegToRewrite = RedefMap[RewriteReg];
+    DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
+    if (RI != RedefMap.end())
+      RegToRewrite = RI->second;
 
     const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
     const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);

>From ae50ec8d93c8d235e8c92e7b9dc95901affdc3b2 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 16 Jan 2026 11:12:08 -0600
Subject: [PATCH 29/35] Fix formatting errors

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2fcd30564578f..379d494ea5d27 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -93,8 +93,7 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
 
 static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
     "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
-    cl::desc("Disable rewrie mfma rewrite scheduling stage"),
-    cl::init(false));
+    cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(false));
 
 const unsigned ScheduleMetrics::ScaleFactor = 100;
 
@@ -2130,7 +2129,7 @@ bool RewriteMFMAFormStage::rewrite(
   DenseMap<MachineInstr *, unsigned> LastMIToRegion;
 
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
-   RegionBoundaries Entry = DAG.Regions[Region];
+    RegionBoundaries Entry = DAG.Regions[Region];
     if (Entry.first == Entry.second)
       continue;
 

>From 825a1a3636584cb83bb0c1be0db03b94c9071307 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 16 Jan 2026 13:31:14 -0600
Subject: [PATCH 30/35] Review comments.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 379d494ea5d27..e65ffba7ee9ce 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2098,7 +2098,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
 
   // For each CopyForDef, increase the cost by the register size while
   // accounting for block frequency.
-  for (auto *DefMI : CopyForDef) {
+  for (MachineInstr *DefMI : CopyForDef) {
     auto DefReg = DefMI->getOperand(0).getReg();
     uint64_t DefFreq =
         EntryFreq
@@ -2206,7 +2206,7 @@ bool RewriteMFMAFormStage::rewrite(
       Register MappedReg = Src2->getReg();
       SmallVector<SlotIndex, 8> Src2ReachingDefs;
       findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
-      SmallVector<MachineInstr *, 8> Src2DefsReplace;
+      SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
 
       for (SlotIndex RDIndex : Src2ReachingDefs) {
         MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
@@ -2214,8 +2214,7 @@ bool RewriteMFMAFormStage::rewrite(
           continue;
 
         // If there is a non mai reaching def, then we need a copy.
-        if (find(Src2DefsReplace, RD) == Src2DefsReplace.end())
-          Src2DefsReplace.push_back(RD);
+        Src2DefsReplace.insert(RD);
       }
 
       if (!Src2DefsReplace.empty()) {

>From 23a77f0ec94141340f459e09ce8af888fe86fb5a Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Fri, 16 Jan 2026 13:34:24 -0600
Subject: [PATCH 31/35] Review comments.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e65ffba7ee9ce..85592affd2eed 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2099,7 +2099,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   // For each CopyForDef, increase the cost by the register size while
   // accounting for block frequency.
   for (MachineInstr *DefMI : CopyForDef) {
-    auto DefReg = DefMI->getOperand(0).getReg();
+    Register DefReg = DefMI->getOperand(0).getReg();
     uint64_t DefFreq =
         EntryFreq
             ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq

>From 44f069fb3e22307d2d3d36dcf05da7409d3a53ad Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 19 Jan 2026 06:24:44 -0600
Subject: [PATCH 32/35] Review comments

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 85592affd2eed..8aa6eedafdbcb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2182,7 +2182,7 @@ bool RewriteMFMAFormStage::rewrite(
   // copy) that it should be replaced with.
   DenseMap<Register, Register> RedefMap;
   // The map of the original MFMA registers to the relevant MFMA operands.
-  DenseMap<Register, std::set<MachineOperand *>> ReplaceMap;
+  DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
   // The map of reaching defs for a given register -- to avoid duplicate copies.
   DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
   // The map of reaching uses for a given register by basic block -- to avoid
@@ -2340,7 +2340,7 @@ bool RewriteMFMAFormStage::rewrite(
       }
     }
 
-    std::set<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
+    DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
     for (MachineOperand *RU : DstReachingUseCopies) {
       MachineBasicBlock *RUBlock = RU->getParent()->getParent();
       // Just keep track of the reaching use of this register by block. After we

>From 73bbb8c4849f851bfd1063e99fa6f1c0df24bcec Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 20 Jan 2026 10:08:58 -0600
Subject: [PATCH 33/35] Restore register classes back to VGPR if we determine
 rewrite is unprofitable

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  22 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |   5 +
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 684 +++++++++---------
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  16 +-
 4 files changed, 375 insertions(+), 352 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8aa6eedafdbcb..ad4c36af3dcf8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1323,8 +1323,10 @@ bool RewriteMFMAFormStage::initGCNSchedStage() {
 
   // If we haven't found the beneficial conditions, prefer the VGPR form which
   // may result in less cross RC copies.
-  if (Cost > 0)
+  if (Cost > 0) {
+    restoreRegClasses(RewriteCands);
     return false;
+  }
 
   return rewrite(RewriteCands);
 }
@@ -1961,6 +1963,8 @@ bool RewriteMFMAFormStage::initHeuristics(
     std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
     DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
     SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  bool Changed = false;
+
   // Prepare for the heuristics
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
@@ -2022,10 +2026,11 @@ bool RewriteMFMAFormStage::initHeuristics(
       DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
       if (Src2->isReg())
         DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+      Changed = true;
     }
   }
 
-  return true;
+  return Changed;
 }
 
 int64_t RewriteMFMAFormStage::getRewriteCost(
@@ -2123,6 +2128,19 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   return Cost + CopyCost;
 }
 
+void RewriteMFMAFormStage::restoreRegClasses(
+    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+  for (auto &[MI, OriginalOpcode] : RewriteCands) {
+    MachineOperand &Dst = MI->getOperand(0);
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    const TargetRegisterClass *AGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+    const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+    DAG.MRI.setRegClass(Dst.getReg(), VGPRRC);
+    if (Src2->isReg())
+      DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+  }
+}
+
 bool RewriteMFMAFormStage::rewrite(
     const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
   DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 0a79da061ab8e..37149c841f09f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -439,6 +439,11 @@ class RewriteMFMAFormStage : public GCNSchedStage {
                  DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
                  SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
+  /// Restore the register classes speculatively rewritten by initHeuristics if
+  /// we exit without rewriting the instructions permanently.
+  void restoreRegClasses(
+      const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
+
   /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
   /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
   /// costs, and \p RewriteCands to undo rewriting.
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index 56a307d2afb56..1b41b61a9b57d 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -228,15 +228,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]].sub0, [[DEF12]], implicit $exec
@@ -324,15 +324,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
@@ -427,21 +427,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -541,21 +541,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
@@ -652,27 +652,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -680,14 +680,14 @@ body:             |
   ; CHECK-NEXT:   KILL [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -780,27 +780,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -911,23 +911,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -939,16 +939,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1068,23 +1068,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -1096,16 +1096,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1228,15 +1228,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1349,15 +1349,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1484,21 +1484,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1629,21 +1629,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1756,47 +1756,46 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF10]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF10]], implicit $exec
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF12]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF13]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF7]], [[DEF8]], [[DEF14]], 4, 4, [[DEF9]].sub0, [[DEF10]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -1812,21 +1811,22 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub1, [[DEF10]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[V_ADD_U32_e32_]].sub0, [[DEF10]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[DEF13]], [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   KILL [[DEF16]], [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF15]], [[DEF11]], [[DEF12]], [[DEF13]], [[DEF14]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1936,27 +1936,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2096,23 +2096,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2124,16 +2124,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -2282,23 +2282,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -2310,10 +2310,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -2716,21 +2716,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -2831,21 +2831,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
@@ -2938,24 +2938,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 0, 0, implicit $exec
@@ -3047,24 +3047,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
@@ -3158,26 +3158,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3189,10 +3189,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 128, 0, implicit $exec
@@ -3294,26 +3294,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -3325,10 +3325,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
@@ -3434,7 +3434,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3442,10 +3442,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3554,7 +3554,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
@@ -3562,10 +3562,10 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3693,21 +3693,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3839,21 +3839,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3978,24 +3978,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4112,24 +4112,24 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4254,26 +4254,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4285,10 +4285,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4419,26 +4419,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4450,10 +4450,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DS_READ_B128_gfx9_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF13]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF14]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[DEF15]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4592,15 +4592,15 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3]], 0, 0, implicit $exec
@@ -4690,23 +4690,23 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -4718,10 +4718,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF14]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF16]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4856,27 +4856,27 @@ body:             |
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[V_ADD_U32_e32_]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF13]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF14]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF8]], [[DEF9]], [[DEF15]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -4994,21 +4994,21 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DS_READ_B128_gfx9_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -5134,7 +5134,6 @@ body:             |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   dead [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -5142,29 +5141,30 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_ADD_U32_e32_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF9]], [[DEF10]], [[DEF15]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF15]].sub0, [[DEF12]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
index 40f87e838d314..ab5a5cfd345a4 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -55,7 +55,7 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
@@ -65,13 +65,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -86,7 +86,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -239,7 +239,7 @@ body:             |
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
@@ -249,13 +249,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:areg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -270,7 +270,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)

>From 6cf8df8f3ab8ff04501720db942cf3cd3df13331 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 20 Jan 2026 15:35:36 -0600
Subject: [PATCH 34/35] Restore original set/reset of opcode and register classes.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 38 ++++++++++---------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  5 ---
 .../AMDGPU/sched_mfma_rewrite_copies.mir      |  4 +-
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ad4c36af3dcf8..15e110c26aa7c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1323,10 +1323,8 @@ bool RewriteMFMAFormStage::initGCNSchedStage() {
 
   // If we haven't found the beneficial conditions, prefer the VGPR form which
   // may result in less cross RC copies.
-  if (Cost > 0) {
-    restoreRegClasses(RewriteCands);
+  if (Cost > 0)
     return false;
-  }
 
   return rewrite(RewriteCands);
 }
@@ -1975,6 +1973,7 @@ bool RewriteMFMAFormStage::initHeuristics(
       assert(ReplacementOp != -1);
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
+      MI.setDesc(TII->get(ReplacementOp));
 
       MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
       if (Src2->isReg()) {
@@ -1985,7 +1984,7 @@ bool RewriteMFMAFormStage::initHeuristics(
         // insert a copy.
         for (SlotIndex RDIdx : Src2ReachingDefs) {
           MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
-          if (!isRewriteCandidate(RD))
+          if (!TII->isMAI(*RD))
             CopyForDef.insert(RD);
         }
       }
@@ -2056,7 +2055,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     GCNRegPressure &PressureBefore = DAG.Pressure[Region];
     unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
-
+ 
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
@@ -2065,7 +2064,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
 
-    uint64_t BlockFreq =
+     uint64_t BlockFreq =
         MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
 
@@ -2096,7 +2095,6 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
   // Set the cost to the largest decrease in spill cost in order to not double
   // count spill reductions.
   Cost = BestSpillCost;
-
   assert(Cost <= 0);
 
   unsigned CopyCost = 0;
@@ -2124,21 +2122,27 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
       CopyCost += RC->getCopyCost() * UseFreq;
     }
   }
-
-  return Cost + CopyCost;
-}
-
-void RewriteMFMAFormStage::restoreRegClasses(
-    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+  
+  // Reset the classes that were changed to AGPR for better RB analysis.
+  // We must do rewriting after copy-insertion, as some defs of the register 
+  // may require VGPR.  Additionally, if we bail out and don't perform the
+  // rewrite then these need to be restored anyway.
   for (auto &[MI, OriginalOpcode] : RewriteCands) {
-    MachineOperand &Dst = MI->getOperand(0);
-    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
-    const TargetRegisterClass *AGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+    assert(TII->isMAI(*MI));
+    const TargetRegisterClass *AGPRRC =
+        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
     const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
-    DAG.MRI.setRegClass(Dst.getReg(), VGPRRC);
+
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    assert(Src2);
+
     if (Src2->isReg())
       DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+    MI->setDesc(TII->get(OriginalOpcode));
   }
+
+  return Cost + CopyCost;
 }
 
 bool RewriteMFMAFormStage::rewrite(
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 37149c841f09f..0a79da061ab8e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -439,11 +439,6 @@ class RewriteMFMAFormStage : public GCNSchedStage {
                  DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
                  SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
-  /// Restore the register classes speculatively rewritten by initHueristics if
-  /// we exit without rewriting the instructions permanently.
-  void restoreRegClasses(
-      const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
-
   /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
   /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
   /// costs, and \p RewriteCands to undo rewriting.
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index 1b41b61a9b57d..44e5563ce7adb 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -2447,7 +2447,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
@@ -2577,7 +2577,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF

>From 5c173d85ed68315a8ec6a2f84ceab931330b89fa Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Wed, 21 Jan 2026 13:47:49 -0600
Subject: [PATCH 35/35] Fix formatting errors.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 15e110c26aa7c..ebc90eeb9e3f2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2055,7 +2055,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     GCNRegPressure &PressureBefore = DAG.Pressure[Region];
     unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
- 
+
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
@@ -2064,7 +2064,7 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
     unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
         MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
 
-     uint64_t BlockFreq =
+    uint64_t BlockFreq =
         MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
 
@@ -2122,9 +2122,9 @@ int64_t RewriteMFMAFormStage::getRewriteCost(
       CopyCost += RC->getCopyCost() * UseFreq;
     }
   }
-  
+
   // Reset the classes that were changed to AGPR for better RB analysis.
-  // We must do rewriting after copy-insertion, as some defs of the register 
+  // We must do rewriting after copy-insertion, as some defs of the register
   // may require VGPR.  Additionally, if we bail out and don't perform the
   // rewrite then these need to be restored anyway.
   for (auto &[MI, OriginalOpcode] : RewriteCands) {



More information about the llvm-commits mailing list