[llvm-branch-commits] [llvm] [AMDGPU] Add MemoryPipeline scheduling to Coexec sched (PR #192325)

Jeffrey Byrnes via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu May 28 20:21:53 PDT 2026


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/192325

>From 50b341fc95903076d8283116982817d11eacdcac Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 20 Mar 2026 14:58:27 -0700
Subject: [PATCH 1/7] [AMDGPU] Add MemoryPipeline scheduling to Coexec sched

Change-Id: I52c476834155823d1ba998cdbbcb3ad6a7e6f2f5
---
 .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp      |  99 ++++--
 .../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h |  18 +
 .../AMDGPU/coexec-sched-effective-stall.mir   | 323 ++++++++++++++++++
 3 files changed, 418 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 9abf26e21a256..038072c096e3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -68,7 +68,7 @@ InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
   // Check for specific opcodes first.
   if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
       Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
-      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
+      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM || SII.isWaitcnt(Opc))
     return InstructionFlavor::Fence;
 
   if (SII.isLDSDMA(MI))
@@ -456,20 +456,21 @@ unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
 }
 
 bool CandidateHeuristics::tryEffectiveStall(
-    GenericSchedulerBase::SchedCandidate &Cand,
-    GenericSchedulerBase::SchedCandidate &TryCand, SchedBoundary &Zone) {
-
-  // Treat structural and latency stalls as a single scheduling cost for the
-  // current cycle.
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) {
+
+  // Treat stalls as a single scheduling cost for the current cycle.
   struct StallCosts {
     unsigned Ready = 0;
     unsigned Structural = 0;
     unsigned Latency = 0;
-    unsigned Effective = 0;
     unsigned Carried = 0;
     unsigned Buffer = 0;
+    unsigned Fence = 0;
+    unsigned Effective = 0;
   };
-
+
+
   auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
     InstructionFlavor Flavor = classifyFlavor(
         *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
@@ -490,6 +491,32 @@ bool CandidateHeuristics::tryEffectiveStall(
   };
 
   unsigned CurrCycle = Zone.getCurrCycle();
+
+  auto getFenceStalls = [this, &CurrCycle](SUnit *SU) -> unsigned {
+    InstructionFlavor Flavor = classifyFlavor(
+        *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
+    // Fence stall cost for SU; anything that is not Fence-flavored costs 0.
+    if (Flavor != InstructionFlavor::Fence)
+      return 0;
+    // Look up the last DS and Fence SUs scheduled on their hardware units.
+    HardwareUnitInfo *FenceHWUI = getHWUIFromFlavor(Flavor);
+    HardwareUnitInfo *DSHWUI = getHWUIFromFlavor(InstructionFlavor::DS);
+    // NOTE(review): the HWUI pointers are dereferenced without a null check.
+    SUnit *LastDS = DSHWUI->getLastScheduledSU();
+    if (!LastDS)
+      return 0;
+    // LastFenceCycle falls back to 0 when no Fence SU has been recorded.
+    SUnit *LastFence = FenceHWUI->getLastScheduledSU();
+    unsigned LastFenceCycle = LastFence ? LastFence->TopReadyCycle : 0;
+    unsigned LastDSCycle = LastDS->TopReadyCycle;
+    // If the last Fence's cycle is later than the last DS's, report no stall.
+    if (LastDSCycle < LastFenceCycle)
+      return 0;
+    // Stall for the cycles remaining until LastDSFinish, if it is still ahead.
+    unsigned LastDSFinish = LastDSCycle + getHWUICyclesForSU(LastDS);
+    return LastDSFinish <= CurrCycle ? 0 : LastDSFinish - CurrCycle;
+  };
+
   auto GetStallCosts = [&](SUnit *SU) {
     unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
     StallCosts Costs;
@@ -499,9 +526,9 @@ bool CandidateHeuristics::tryEffectiveStall(
     unsigned CarriedLatency = CarriedLatencies.lookup_or(SU->getInstr(), 0);
     Costs.Carried = CarriedLatency > CurrCycle ? CarriedLatency - CurrCycle : 0;
     Costs.Buffer = getBufferFullStalls(SU);
-
+    Costs.Fence = getFenceStalls(SU);
     Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency,
-                                Costs.Carried, Costs.Buffer});
+                                Costs.Carried, Costs.Buffer, Costs.Fence});
     return Costs;
   };
 
@@ -512,17 +539,44 @@ bool CandidateHeuristics::tryEffectiveStall(
     dbgs() << "Effective stalls: try=" << TryCosts.Effective
            << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
            << ", lat=" << TryCosts.Latency << ", carried=" << TryCosts.Carried
-           << ", buffer=" << TryCosts.Buffer << ") cand=" << CandCosts.Effective
-           << " (ready=" << CandCosts.Ready
-           << ", struct=" << CandCosts.Structural
-           << ", lat=" << CandCosts.Latency << ", carried=" << CandCosts.Carried
-           << ", buffer=" << CandCosts.Buffer << ")\n";
+           << ", buffer=" << TryCosts.Buffer << ", fence=" << TryCosts.Fence
+           << ") cand=" << CandCosts.Effective << " (ready=" << CandCosts.Ready
+           << ", struct=" << CandCosts.Structural << ", lat=" << CandCosts.Latency
+           << ", carried=" << CandCosts.Carried << ", buffer=" << CandCosts.Buffer
+           << ", fence=" << CandCosts.Fence << ")\n";
   });
 
   return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand,
                  AMDGPUCoExecSchedStrategy::Stall);
 }
 
+bool CandidateHeuristics::tryMemoryPipeline(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand) {
+  // Memory-pipeline work here means a DMA- or Fence-flavored instruction.
+  auto IsMemoryPipeline = [&](const SUnit *SU) {
+    InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
+    return Flavor == InstructionFlavor::DMA ||
+           Flavor == InstructionFlavor::Fence;
+  };
+  const bool TryIsMemPipe = IsMemoryPipeline(TryCand.SU);
+  const bool CandIsMemPipe = IsMemoryPipeline(Cand.SU);
+
+  // Both or neither are memory-pipeline work: nothing to decide here.
+  if (TryIsMemPipe == CandIsMemPipe)
+    return false;
+
+  // Only TryCand is memory-pipeline work: tag it with RegCritical.
+  if (TryIsMemPipe) {
+    TryCand.Reason = GenericSchedulerBase::RegCritical;
+    return true;
+  }
+  // Only Cand is: lower its reason to RegCritical unless already lower.
+  if (Cand.Reason > GenericSchedulerBase::RegCritical)
+    Cand.Reason = GenericSchedulerBase::RegCritical;
+  return true;
+}
+
 bool CandidateHeuristics::tryCriticalResourceDependency(
     GenericSchedulerBase::SchedCandidate &TryCand,
     GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
@@ -856,8 +910,15 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
   if (SameBoundary) {
     // Compare candidates by the stall they would introduce if
     // scheduled in the current cycle.
-    if (Heurs.tryEffectiveStall(Cand, TryCand, *Zone))
+    if (Heurs.tryEffectiveStall(TryCand, Cand, *Zone)) {
+      LastAMDGPUReason = AMDGPUSchedReason::Stall;
+      return TryCand.Reason != NoCand;
+    }
+
+    if (Heurs.tryMemoryPipeline(TryCand, Cand)) {
+      LastAMDGPUReason = AMDGPUSchedReason::MemoryPipeline;
       return TryCand.Reason != NoCand;
+    }
 
     Heurs.sortHWUIResources();
     if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
@@ -902,12 +963,6 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
     return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
-    // Avoid serializing long latency dependence chains.
-    // For acyclic path limited loops, latency was already checked above.
-    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
-        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
-      return TryCand.Reason != NoCand;
-
     // Fall through to original instruction order.
     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 177f77dde7562..4b1568c674300 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -127,6 +127,8 @@ inline FlavorGroup all() {
 /// than the generic CandReason enum for debugging purposes.
 enum class AMDGPUSchedReason : uint8_t {
   None,
+  Stall,
+  MemoryPipeline,
   CritResourceBalance, // tryCriticalResource chose based on resource pressure
   CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
   NUM_REASONS
@@ -136,6 +138,10 @@ constexpr StringRef getReasonName(AMDGPUSchedReason R) {
   switch (R) {
   case AMDGPUSchedReason::None:
     return "None";
+  case AMDGPUSchedReason::Stall:
+    return "Stall";
+  case AMDGPUSchedReason::MemoryPipeline:
+    return "MemoryPipeline";
   case AMDGPUSchedReason::CritResourceBalance:
     return "CritResource";
   case AMDGPUSchedReason::CritResourceDep:
@@ -236,6 +242,15 @@ class HardwareUnitInfo {
            ScheduledSUs[ScheduledSUs.size() - BufferSize]->TopReadyCycle;
   }
 
+  /// \returns the final entry of ScheduledSUs (the most recently scheduled
+  /// SU for this HardwareUnit), or nullptr when nothing has been recorded.
+  SUnit *getLastScheduledSU() {
+    const unsigned NumScheduled = ScheduledSUs.size();
+
+    // A zero count must not reach the NumScheduled - 1 index below.
+    return NumScheduled ? ScheduledSUs[NumScheduled - 1] : nullptr;
+  }
+
   /// \returns the SUnit with higher priority or nullptr if they are the same.
   /// This method looks through the PrioritySUs to determine if one SU is more
   /// prioritized than the other. If neither are in the PrioritySUs list, then
@@ -349,6 +364,9 @@ class CandidateHeuristics {
                          GenericSchedulerBase::SchedCandidate &Cand,
                          SchedBoundary &Zone);
 
+  bool tryMemoryPipeline(GenericSchedulerBase::SchedCandidate &TryCand,
+                         GenericSchedulerBase::SchedCandidate &Cand);
+
   /// Check for critical resource consumption. Prefer the candidate that uses
   /// the most prioritized HardwareUnit. If both candidates use the same
   /// HarwareUnit, prefer the candidate with higher priority on that
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 0a6f2fe9375d5..7868c7dcf88a3 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -5,6 +5,9 @@
 --- |
   define void @test-sched-effective-stall() #0 { ret void }
   define void @test-sched-pending-structural-stall() #0 { ret void }
+  define void @test-fence-stall() #0 { ret void }
+  define void @test-tensorcnt-stall() #0 { ret void }
+  define void @test-dscnt-stall() #0 { ret void }
 
   attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
 ...
@@ -121,3 +124,323 @@ body: |
     S_NOP 0
     S_ENDPGM 0, implicit %10, implicit %11
 ...
+
+
+---
+name: test-fence-stall
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; DEFAULT-LABEL: name: test-fence-stall
+    ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+    ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+    ; DEFAULT-NEXT: ATOMIC_FENCE 4, 2
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; COEXEC-LABEL: name: test-fence-stall
+    ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
+    ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %95:vgpr_32 = IMPLICIT_DEF
+    %90:vreg_256_align2 = IMPLICIT_DEF
+    %91:vreg_256_align2 = IMPLICIT_DEF
+    %85:sreg_64_xexec = IMPLICIT_DEF
+    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %2373:vreg_512_align2 = IMPLICIT_DEF
+    %2369:vreg_512_align2 = IMPLICIT_DEF
+    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %8:vgpr_32_lo256 = IMPLICIT_DEF
+    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    ATOMIC_FENCE 4, 2
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0, amdgpu_allvgprs
+...
+
+
+---
+name: test-tensorcnt-stall
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; DEFAULT-LABEL: name: test-tensorcnt-stall
+    ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+    ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+    ; DEFAULT-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; COEXEC-LABEL: name: test-tensorcnt-stall
+    ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %95:vgpr_32 = IMPLICIT_DEF
+    %90:vreg_256_align2 = IMPLICIT_DEF
+    %91:vreg_256_align2 = IMPLICIT_DEF
+    %85:sreg_64_xexec = IMPLICIT_DEF
+    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %2373:vreg_512_align2 = IMPLICIT_DEF
+    %2369:vreg_512_align2 = IMPLICIT_DEF
+    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %8:vgpr_32_lo256 = IMPLICIT_DEF
+    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0, amdgpu_allvgprs
+...
+
+---
+name: test-dscnt-stall
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; DEFAULT-LABEL: name: test-dscnt-stall
+    ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+    ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+    ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+    ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+    ; DEFAULT-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; COEXEC-LABEL: name: test-dscnt-stall
+    ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %95:vgpr_32 = IMPLICIT_DEF
+    %90:vreg_256_align2 = IMPLICIT_DEF
+    %91:vreg_256_align2 = IMPLICIT_DEF
+    %85:sreg_64_xexec = IMPLICIT_DEF
+    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %2373:vreg_512_align2 = IMPLICIT_DEF
+    %2369:vreg_512_align2 = IMPLICIT_DEF
+    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %8:vgpr_32_lo256 = IMPLICIT_DEF
+    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0, amdgpu_allvgprs
+...

>From 6c87941a9d38de8cf36c276d0ed9ab31c3d176ae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 25 Mar 2026 18:22:24 -0700
Subject: [PATCH 2/7] Add a comment

Change-Id: I447f7f1fb185b18924cfd98249b5a0a05fef2484
---
 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 4b1568c674300..6e056a9554afa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -364,6 +364,13 @@ class CandidateHeuristics {
                          GenericSchedulerBase::SchedCandidate &Cand,
                          SchedBoundary &Zone);
 
+  /// Prioritize instructions involved the memory pipeline. Currently we don't have
+  /// any modelling of pipelined loads, so we control the layout of the pipeline
+  /// per iteration by giving the user some control over the stalls (e.g. between
+  /// s_barrier_signal and s_barrier_wait) and scheduling the pipeline instructions
+  /// as soon as they are ready.
+  ///
+  /// TODO -- add better modelling and heuristics for pipelining based scheduling.
   bool tryMemoryPipeline(GenericSchedulerBase::SchedCandidate &TryCand,
                          GenericSchedulerBase::SchedCandidate &Cand);
 

>From 88817729b47781743ea13769592cb00ab5df72a2 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:04:56 -0700
Subject: [PATCH 3/7] Make fence heuristic work bottom-up

Change-Id: I629cbc8905b87a962e8b123287e5f60a3154df6b
---
 .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp      | 41 ++++++++++---------
 .../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 13 +++---
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 038072c096e3c..c1b45342b6d77 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -422,8 +422,7 @@ void CandidateHeuristics::sortHWUIResources() {
 
 unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
                                                        SUnit *SU) {
-  // Only implemented for top-down scheduling currently.
-  if (!Zone.isTop() || !SU)
+  if (!SU)
     return 0;
 
   MachineInstr *MI = SU->getInstr();
@@ -458,7 +457,10 @@ unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
 bool CandidateHeuristics::tryEffectiveStall(
     GenericSchedulerBase::SchedCandidate &TryCand,
     GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) {
-  
+  // Only implemented for top-down scheduling
+  if (!Zone.isTop())
+    return 0;
+
   // Treat stalls as a single scheduling cost for the current cycle.
   struct StallCosts {
     unsigned Ready = 0;
@@ -469,8 +471,7 @@ bool CandidateHeuristics::tryEffectiveStall(
     unsigned Fence = 0;
     unsigned Effective = 0;
   };
-  
-  
+
   auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
     InstructionFlavor Flavor = classifyFlavor(
         *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
@@ -480,8 +481,6 @@ bool CandidateHeuristics::tryEffectiveStall(
     if (HWUI->getBufferSize() == 0)
       return 0;
 
-    // getBufferAvailableCycle assumes top-down scheduling.
-    assert(Zone.isTop());
     unsigned CurrCycle = Zone.getCurrCycle();
     unsigned BufferReadyCycle = HWUI->getBufferAvailableCycle(CurrCycle);
     if (BufferReadyCycle <= CurrCycle)
@@ -492,29 +491,33 @@ bool CandidateHeuristics::tryEffectiveStall(
 
   unsigned CurrCycle = Zone.getCurrCycle();
 
-  auto getFenceStalls = [this, &CurrCycle](SUnit *SU) -> unsigned {
+  auto getFenceStalls = [this, &CurrCycle, &Zone](SUnit *SU) -> unsigned {
     InstructionFlavor Flavor = classifyFlavor(
         *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
 
-    if (Flavor != InstructionFlavor::Fence)
+    bool IsTop = Zone.isTop();
+    if ((Flavor != InstructionFlavor::Fence && IsTop) ||
+        (Flavor != InstructionFlavor::DS && !IsTop))
       return 0;
 
-    HardwareUnitInfo *FenceHWUI = getHWUIFromFlavor(Flavor);
-    HardwareUnitInfo *DSHWUI = getHWUIFromFlavor(InstructionFlavor::DS);
+    HardwareUnitInfo *ConsumerHWUI = getHWUIFromFlavor(Flavor);
+    HardwareUnitInfo *ProducerHWUI = getHWUIFromFlavor(
+        IsTop ? InstructionFlavor::DS : InstructionFlavor::Fence);
 
-    SUnit *LastDS = DSHWUI->getLastScheduledSU();
-    if (!LastDS)
+    SUnit *LastProducer = ProducerHWUI->getLastScheduledSU();
+    if (!LastProducer)
       return 0;
 
-    SUnit *LastFence = FenceHWUI->getLastScheduledSU();
-    unsigned LastFenceCycle = LastFence ? LastFence->TopReadyCycle : 0;
-    unsigned LastDSCycle = LastDS->TopReadyCycle;
+    SUnit *LastConsumer = ConsumerHWUI->getLastScheduledSU();
+    unsigned LastConsumerCycle = LastConsumer ? LastConsumer->TopReadyCycle : 0;
+    unsigned LastProducerCycle = LastProducer->TopReadyCycle;
 
-    if (LastDSCycle < LastFenceCycle)
+    if (LastProducerCycle < LastConsumerCycle)
       return 0;
 
-    unsigned LastDSFinish = LastDSCycle + getHWUICyclesForSU(LastDS);
-    return LastDSFinish <= CurrCycle ? 0 : LastDSFinish - CurrCycle;
+    unsigned FenceStallFinish =
+        LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
+    return FenceStallFinish <= CurrCycle ? 0 : FenceStallFinish - CurrCycle;
   };
 
   auto GetStallCosts = [&](SUnit *SU) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 6e056a9554afa..b69b6c0446b2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -364,13 +364,14 @@ class CandidateHeuristics {
                          GenericSchedulerBase::SchedCandidate &Cand,
                          SchedBoundary &Zone);
 
-  /// Prioritize instructions involved the memory pipeline. Currently we don't have
-  /// any modelling of pipelined loads, so we control the layout of the pipeline
-  /// per iteration by giving the user some control over the stalls (e.g. between
-  /// s_barrier_signal and s_barrier_wait) and scheduling the pipeline instructions
-  /// as soon as they are ready.
+  /// Prioritize instructions involved in the memory pipeline. Currently we
+  /// don't have any modelling of pipelined loads, so we control the layout of
+  /// the pipeline per iteration by giving the user some control over the stalls
+  /// (e.g. between s_barrier_signal and s_barrier_wait) and scheduling the
+  /// pipeline instructions as soon as they are ready.
   ///
-  /// TODO -- add better modelling and heuristics for pipelining based scheduling.
+  /// TODO -- add better modelling and heuristics for pipelining based
+  /// scheduling.
   bool tryMemoryPipeline(GenericSchedulerBase::SchedCandidate &TryCand,
                          GenericSchedulerBase::SchedCandidate &Cand);
 

>From 646ff7544bc6aec14f033169c8498d0468198ab4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:08:36 -0700
Subject: [PATCH 4/7] Add back tryLatency

Change-Id: I12d4f255c48ed77ba927eb3b192e5903f1f5e24f
---
 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index c1b45342b6d77..11881141392f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -966,6 +966,12 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
     return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
+    // Avoid serializing long latency dependence chains.
+    // For acyclic path limited loops, latency was already checked above.
+    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
     // Fall through to original instruction order.
     if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
         (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {

>From ddef328eea399aa8993e1824e5765984e957035d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:11:50 -0700
Subject: [PATCH 5/7] Add comment

Change-Id: I2180bba631fe4a01ed3c3fbcfa8c19cbefa84133
---
 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 11881141392f9..1a28bc9b4b7d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -515,6 +515,7 @@ bool CandidateHeuristics::tryEffectiveStall(
     if (LastProducerCycle < LastConsumerCycle)
       return 0;
 
+    // Latency comes from DS regardless of bottom-up / top-down.
     unsigned FenceStallFinish =
         LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
     return FenceStallFinish <= CurrCycle ? 0 : FenceStallFinish - CurrCycle;

>From a3ec9c389ce3df89f9bcd062a0f0e10bad6d79fc Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 15 Apr 2026 16:14:27 -0700
Subject: [PATCH 6/7] Address comments from
 https://github.com/llvm/llvm-project/pull/188658

Change-Id: Ia94c567a753168c1ffa16dc5d91195e7dd0ba044
---
 .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp      |   6 +-
 .../AMDGPU/coexec-sched-effective-stall.mir   | 222 +++++++++---------
 2 files changed, 114 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 1a28bc9b4b7d7..3d9de536e0d28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -728,9 +728,9 @@ void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                            MachineBasicBlock::iterator End,
                                            unsigned NumRegionInstrs) {
   GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
-  assert((PreRADirection == MISched::Unspecified ||
-          PreRADirection == MISched::TopDown) &&
-         "coexec scheduler only supports top-down scheduling");
+  if (PreRADirection == MISched::BottomUp ||
+      PreRADirection == MISched::Bidirectional)
+    report_fatal_error("CoExecSchedStrategy only support TopDown scheduling.");
   RegionPolicy.OnlyTopDown = true;
   RegionPolicy.OnlyBottomUp = false;
   RegionPolicy.ShouldTrackLaneMasks = true;
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 7868c7dcf88a3..772eb91d44c59 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -136,14 +136,14 @@ body: |
     ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -162,7 +162,7 @@ body: |
     ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
     ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
     ; DEFAULT-NEXT: ATOMIC_FENCE 4, 2
-    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
     ;
     ; COEXEC-LABEL: name: test-fence-stall
@@ -191,44 +191,44 @@ body: |
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
-    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
     ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
-    %95:vgpr_32 = IMPLICIT_DEF
-    %90:vreg_256_align2 = IMPLICIT_DEF
-    %91:vreg_256_align2 = IMPLICIT_DEF
-    %85:sreg_64_xexec = IMPLICIT_DEF
-    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
-    %2373:vreg_512_align2 = IMPLICIT_DEF
-    %2369:vreg_512_align2 = IMPLICIT_DEF
-    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_256_align2 = IMPLICIT_DEF
+    %2:vreg_256_align2 = IMPLICIT_DEF
+    %3:sreg_64_xexec = IMPLICIT_DEF
+    %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %5:vreg_512_align2 = IMPLICIT_DEF
+    %6:vreg_512_align2 = IMPLICIT_DEF
+    %7:vreg_128_lo256_align2 = IMPLICIT_DEF
     %8:vgpr_32_lo256 = IMPLICIT_DEF
-    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
-    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
-    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
-    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
-    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
-    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
-    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
-    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
-    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
-    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
-    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
-    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
-    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
-    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
-    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
-    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
-    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+    %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+    undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+    %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+    undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+    %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+    undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+    %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+    undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+    %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+    undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+    %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+    undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+    %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+    %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+    %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
     ATOMIC_FENCE 4, 2
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0, amdgpu_allvgprs
 ...
 
@@ -243,14 +243,14 @@ body: |
     ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -269,7 +269,7 @@ body: |
     ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
     ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
     ; DEFAULT-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
-    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
     ;
     ; COEXEC-LABEL: name: test-tensorcnt-stall
@@ -298,44 +298,44 @@ body: |
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
-    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
     ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
-    %95:vgpr_32 = IMPLICIT_DEF
-    %90:vreg_256_align2 = IMPLICIT_DEF
-    %91:vreg_256_align2 = IMPLICIT_DEF
-    %85:sreg_64_xexec = IMPLICIT_DEF
-    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
-    %2373:vreg_512_align2 = IMPLICIT_DEF
-    %2369:vreg_512_align2 = IMPLICIT_DEF
-    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_256_align2 = IMPLICIT_DEF
+    %2:vreg_256_align2 = IMPLICIT_DEF
+    %3:sreg_64_xexec = IMPLICIT_DEF
+    %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %5:vreg_512_align2 = IMPLICIT_DEF
+    %6:vreg_512_align2 = IMPLICIT_DEF
+    %7:vreg_128_lo256_align2 = IMPLICIT_DEF
     %8:vgpr_32_lo256 = IMPLICIT_DEF
-    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
-    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
-    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
-    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
-    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
-    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
-    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
-    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
-    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
-    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
-    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
-    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
-    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
-    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
-    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
-    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
-    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+    %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+    undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+    %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+    undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+    %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+    undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+    %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+    undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+    %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+    undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+    %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+    undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+    %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+    %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+    %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
     S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0, amdgpu_allvgprs
 ...
 
@@ -349,14 +349,14 @@ body: |
     ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
-    ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
     ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
     ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -375,7 +375,7 @@ body: |
     ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
     ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
     ; DEFAULT-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
-    ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
     ;
     ; COEXEC-LABEL: name: test-dscnt-stall
@@ -404,43 +404,43 @@ body: |
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
-    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
     ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+    ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
-    %95:vgpr_32 = IMPLICIT_DEF
-    %90:vreg_256_align2 = IMPLICIT_DEF
-    %91:vreg_256_align2 = IMPLICIT_DEF
-    %85:sreg_64_xexec = IMPLICIT_DEF
-    %249:sreg_32_xm0_xexec = IMPLICIT_DEF
-    %2373:vreg_512_align2 = IMPLICIT_DEF
-    %2369:vreg_512_align2 = IMPLICIT_DEF
-    %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_256_align2 = IMPLICIT_DEF
+    %2:vreg_256_align2 = IMPLICIT_DEF
+    %3:sreg_64_xexec = IMPLICIT_DEF
+    %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %5:vreg_512_align2 = IMPLICIT_DEF
+    %6:vreg_512_align2 = IMPLICIT_DEF
+    %7:vreg_128_lo256_align2 = IMPLICIT_DEF
     %8:vgpr_32_lo256 = IMPLICIT_DEF
-    undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
-    %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
-    undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
-    %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
-    undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
-    %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
-    undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
-    %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
-    undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
-    %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
-    undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
-    %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
-    undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
-    %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
-    %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
-    undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
-    %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+    undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+    %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+    undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+    %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+    undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+    %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+    undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+    %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+    undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+    %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+    undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+    %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+    undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+    %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+    %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+    undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+    %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
     S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0, amdgpu_allvgprs
 ...

>From d858addf10443df323b5b2920419d1eb08129708 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 28 May 2026 15:38:18 -0700
Subject: [PATCH 7/7] Update tests

Change-Id: Ided5a4e6968d4670d1fd09bcb327a75a25273d4e
---
 .../AMDGPU/coexec-block-carried-latency.mir   |  4 +-
 .../AMDGPU/coexec-sched-effective-stall.mir   | 48 +++++++++----------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/coexec-block-carried-latency.mir b/llvm/test/CodeGen/AMDGPU/coexec-block-carried-latency.mir
index 289a59fd1f032..0aa0ef4f32e14 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-block-carried-latency.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-block-carried-latency.mir
@@ -33,6 +33,7 @@ body: |
   ; DEFAULT-NEXT: bb.1:
   ; DEFAULT-NEXT:   liveins: $vgpr0, $sgpr0, $sgpr1
   ; DEFAULT-NEXT: {{  $}}
+  ; DEFAULT-NEXT:   ATOMIC_FENCE 5, 2
   ; DEFAULT-NEXT:   early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DS_READ_B32_gfx9_]], [[DS_READ_B32_gfx9_1]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; DEFAULT-NEXT:   dead [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 $sgpr0, $sgpr1, implicit-def dead $scc
   ; DEFAULT-NEXT:   dead [[S_ADD_I32_1:%[0-9]+]]:sgpr_32 = S_ADD_I32 $sgpr0, $sgpr1, implicit-def dead $scc
@@ -51,7 +52,6 @@ body: |
   ; DEFAULT-NEXT:   dead [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
   ; DEFAULT-NEXT:   early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; DEFAULT-NEXT:   dead [[S_ADD_I32_9:%[0-9]+]]:sgpr_32 = S_ADD_I32 $sgpr0, $sgpr1, implicit-def dead $scc
-  ; DEFAULT-NEXT:   ATOMIC_FENCE 5, 2
   ; DEFAULT-NEXT:   dead early-clobber %28:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; DEFAULT-NEXT:   dead early-clobber %29:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; DEFAULT-NEXT:   dead early-clobber %30:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -141,8 +141,8 @@ body: |
   ; ALL-NEXT:   dead [[S_ADD_I32_8:%[0-9]+]]:sgpr_32 = S_ADD_I32 $sgpr0, $sgpr1, implicit-def dead $scc
   ; ALL-NEXT:   dead [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
   ; ALL-NEXT:   dead [[S_ADD_I32_9:%[0-9]+]]:sgpr_32 = S_ADD_I32 $sgpr0, $sgpr1, implicit-def dead $scc
-  ; ALL-NEXT:   early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DS_READ_B32_gfx9_]], [[DS_READ_B32_gfx9_1]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; ALL-NEXT:   ATOMIC_FENCE 5, 2
+  ; ALL-NEXT:   early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DS_READ_B32_gfx9_]], [[DS_READ_B32_gfx9_1]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; ALL-NEXT:   early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; ALL-NEXT:   dead early-clobber %28:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
   ; ALL-NEXT:   dead early-clobber %29:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF3]], [[DEF4]], 0, [[DEF5]], [[DS_READ_B32_gfx9_2]], [[DS_READ_B32_gfx9_3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 772eb91d44c59..210eaf1dd4128 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -168,21 +168,21 @@ body: |
     ; COEXEC-LABEL: name: test-fence-stall
     ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -275,21 +275,21 @@ body: |
     ; COEXEC-LABEL: name: test-tensorcnt-stall
     ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -381,21 +381,21 @@ body: |
     ; COEXEC-LABEL: name: test-dscnt-stall
     ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
     ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
-    ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+    ; COEXEC-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
     ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF



More information about the llvm-branch-commits mailing list