[llvm-branch-commits] [llvm] [AMDGPU] Add HWUI pressure heuristics to coexec strategy (PR #184929)

Jeffrey Byrnes via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Mar 5 17:58:15 PST 2026


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/184929

>From e9606df17e12373e8c75246e7e3f6a72deb0ecca Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 4 Mar 2026 07:36:18 -0800
Subject: [PATCH 1/2] [AMDGPU] Add HWUI pressure heuristics to coexec strategy

Change-Id: I322cc670c8d923a6df23588d8a14cdaec1f49da9
---
 .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp      | 435 ++++++++++++-
 .../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 286 ++++++++-
 .../AMDGPU/coexec-sched-effective-stall.mir   |   8 +-
 llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll  | 601 ++++++++++++++++++
 4 files changed, 1302 insertions(+), 28 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 1f1035b85956e..cec06ff514697 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -41,6 +41,368 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
   return OnlyChoice;
 }
 
+InstructionFlavor llvm::classifyFlavor(const MachineInstr *MI,
+                                       const SIInstrInfo *SII) {
+  if (!MI || MI->isDebugInstr())
+    return InstructionFlavor::Other;
+
+  unsigned Opc = MI->getOpcode();
+
+  // Check for specific opcodes first.
+  if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
+      Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
+      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
+    return InstructionFlavor::Fence;
+
+  if ((SII->isFLAT(*MI) || SII->isFLATGlobal(*MI)) && SII->isDS(*MI))
+    return InstructionFlavor::DMA;
+
+  if (SII->isMFMAorWMMA(*MI))
+    return InstructionFlavor::WMMA;
+
+  if (SII->isTRANS(*MI))
+    return InstructionFlavor::TRANS;
+
+  if (SII->isVALU(*MI))
+    return InstructionFlavor::SingleCycleVALU;
+
+  if (SII->isDS(*MI))
+    return InstructionFlavor::DS;
+
+  if (SII->isFLAT(*MI) || SII->isFLATGlobal(*MI) || SII->isFLATScratch(*MI))
+    return InstructionFlavor::VMEM;
+
+  if (SII->isSALU(*MI))
+    return InstructionFlavor::SALU;
+
+  return InstructionFlavor::Other;
+}
+
+SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) {
+  for (auto *PrioritySU : PrioritySUs) {
+    if (!PrioritySU->isTopReady())
+      return PrioritySU;
+  }
+
+  if (!LookDeep)
+    return nullptr;
+
+  // TODO -- we may want to think about more advanced strategies here.
+  unsigned MinDepth = std::numeric_limits<unsigned int>::max();
+  SUnit *TargetSU = nullptr;
+  for (auto *SU : AllSUs) {
+    if (SU->isScheduled)
+      continue;
+
+    if (SU->isTopReady())
+      continue;
+
+    if (SU->getDepth() < MinDepth) {
+      MinDepth = SU->getDepth();
+      TargetSU = SU;
+    }
+  }
+  return TargetSU;
+}
+
+void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
+  bool Inserted = AllSUs.insert(SU);
+  TotalCycles += BlockingCycles;
+
+  assert(Inserted);
+  if (PrioritySUs.empty()) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+  unsigned SUDepth = SU->getDepth();
+  unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+  if (SUDepth > CurrDepth)
+    return;
+
+  if (SUDepth == CurrDepth) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+
+  // SU is lower depth and should be prioritized.
+  PrioritySUs.clear();
+  PrioritySUs.insert(SU);
+}
+
+void HardwareUnitInfo::schedule(SUnit *SU, unsigned BlockingCycles) {
+  // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
+  // we just clear the HWUI. However, we still have instructions which map to
+  // this HWUI. Don't bother managing the state for these HWUI.
+  if (TotalCycles == 0)
+    return;
+
+  AllSUs.remove(SU);
+  PrioritySUs.remove(SU);
+
+  TotalCycles -= BlockingCycles;
+
+  if (AllSUs.empty())
+    return;
+  if (PrioritySUs.empty()) {
+    for (auto SU : AllSUs) {
+      if (PrioritySUs.empty()) {
+        PrioritySUs.insert(SU);
+        continue;
+      }
+      unsigned SUDepth = SU->getDepth();
+      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+      if (SUDepth > CurrDepth)
+        continue;
+
+      if (SUDepth == CurrDepth) {
+        PrioritySUs.insert(SU);
+        continue;
+      }
+
+      // SU is lower depth and should be prioritized.
+      PrioritySUs.clear();
+      PrioritySUs.insert(SU);
+    }
+  }
+}
+
+HardwareUnitInfo *
+CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
+  for (auto &HWUICand : HWUInfo) {
+    if (HWUICand.getType() == Flavor) {
+      return &HWUICand;
+    }
+  }
+  return nullptr;
+}
+
+unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
+  if (SchedModel && SchedModel->hasInstrSchedModel()) {
+    unsigned ReleaseAtCycle = 0;
+    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+    for (TargetSchedModel::ProcResIter
+             PI = SchedModel->getWriteProcResBegin(SC),
+             PE = SchedModel->getWriteProcResEnd(SC);
+         PI != PE; ++PI) {
+      ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
+    }
+    return ReleaseAtCycle;
+  }
+  return -1;
+}
+
+void CandidateHeuristics::schedNode(SUnit *SU) {
+  HardwareUnitInfo *HWUI =
+      getHWUIFromFlavor(classifyFlavor(SU->getInstr(), SII));
+  HWUI->schedule(SU, getHWUICyclesForInst(SU));
+}
+
+void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
+                                     const TargetSchedModel *TargetSchedModel,
+                                     const TargetRegisterInfo *TRI) {
+  DAG = SchedDAG;
+  SchedModel = TargetSchedModel;
+
+  SRI = static_cast<const SIRegisterInfo *>(TRI);
+  SII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS);
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    HWUInfo[I].setType(I);
+    HWUInfo[I].reset();
+  }
+
+  HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
+
+  collectHWUIPressure();
+}
+
+void CandidateHeuristics::collectHWUIPressure() {
+  if (!SchedModel || !SchedModel->hasInstrSchedModel())
+    return;
+
+  for (auto &SU : DAG->SUnits) {
+    InstructionFlavor Flavor = classifyFlavor(SU.getInstr(), SII);
+    HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
+  }
+
+  LLVM_DEBUG(dumpRegionSummary());
+}
+
+void CandidateHeuristics::dumpRegionSummary() {
+  MachineBasicBlock *BB = DAG->begin()->getParent();
+  dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
+         << " (" << DAG->SUnits.size() << " SUs) ===\n";
+
+  dbgs() << "\nHWUI Resource Pressure:\n";
+  for (auto &HWUI : HWUInfo) {
+    if (HWUI.getTotalCycles() == 0)
+      continue;
+
+    StringRef Name = getFlavorName(HWUI.getType());
+    dbgs() << "  [" << HWUI.getIdx() << "] " << Name << ": "
+           << HWUI.getTotalCycles() << " cycles, " << HWUI.size()
+           << " instrs\n";
+  }
+  dbgs() << "\n";
+}
+
+void CandidateHeuristics::sortHWUIResources() {
+  // Highest priority should be first.
+  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
+    // Prefer CoexecWindow producers
+    if (A.producesCoexecWindow() != B.producesCoexecWindow())
+      return A.producesCoexecWindow();
+
+    // Prefer more demanded resources
+    if (A.getTotalCycles() != B.getTotalCycles())
+      return A.getTotalCycles() > B.getTotalCycles();
+
+    // In ties -- prefer the resource with longer latency instructions
+    if (A.size() != B.size())
+      return A.size() < B.size();
+
+    // Default to HardwareUnitInfo order
+    return A.getIdx() < B.getIdx();
+  });
+}
+
+bool CandidateHeuristics::tryCriticalResourceDependency(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+
+  auto IsCandidateResource = [this, &Cand, &TryCand](unsigned ResourceIdx) {
+    HardwareUnitInfo HWUI = HWUInfo[ResourceIdx];
+
+    auto CandFlavor = classifyFlavor(Cand.SU->getInstr(), SII);
+    auto TryCandFlavor = classifyFlavor(TryCand.SU->getInstr(), SII);
+    bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
+                     TryCandFlavor == InstructionFlavor::DS) &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+
+    // If we do not have a TargetSU for this resource, then it is not critical.
+    if (!TargetSU)
+      return false;
+
+    return true;
+  };
+
+  auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
+    HardwareUnitInfo HWUI = HWUInfo[ResourceIdx];
+    auto CandFlavor = classifyFlavor(Cand.SU->getInstr(), SII);
+
+    // We want to ensure our DS order matches WMMA order.
+    bool LookDeep = CandFlavor == InstructionFlavor::DS &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+
+    bool CandEnables =
+        TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
+    bool TryCandEnables =
+        TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
+
+    if (!CandEnables && !TryCandEnables)
+      return false;
+
+    if (CandEnables && !TryCandEnables) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (!CandEnables && TryCandEnables) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Both enable, prefer the critical path.
+    bool CandHeight = Cand.SU->getHeight();
+    bool TryCandHeight = TryCand.SU->getHeight();
+
+    if (CandHeight > TryCandHeight) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (CandHeight < TryCandHeight) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Same critical path, just prefer original candidate.
+    if (Cand.Reason > GenericSchedulerBase::RegCritical)
+      Cand.Reason = GenericSchedulerBase::RegCritical;
+
+    return true;
+  };
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    // If we have encountered a resource that is not critical, then neither
+    // candidate enables a critical resource
+    if (!IsCandidateResource(I))
+      return false;
+
+    bool Enabled = TryEnablesResource(I);
+    // If neither has enabled the resource, continue to the next resource
+    if (Enabled)
+      return true;
+  }
+  return false;
+}
+
+bool CandidateHeuristics::tryCriticalResource(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    HardwareUnitInfo HWUI = HWUInfo[I];
+
+    bool CandUsesCrit = HWUI.contains(Cand.SU);
+    bool TryCandUsesCrit = HWUI.contains(TryCand.SU);
+
+    if (!CandUsesCrit && !TryCandUsesCrit)
+      continue;
+
+    if (CandUsesCrit != TryCandUsesCrit) {
+      if (CandUsesCrit) {
+        if (Cand.Reason > GenericSchedulerBase::RegCritical)
+          Cand.Reason = GenericSchedulerBase::RegCritical;
+        return true;
+      }
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Otherwise, both use the critical resource
+    // For longer latency InstructionFlavors, we should prioritize first by
+    // their enablement of critical resources
+    if (HWUI.getType() == InstructionFlavor::DS) {
+      if (tryCriticalResourceDependency(TryCand, Cand, Zone))
+        return true;
+    }
+
+    // Prioritize based on HWUI priorities.
+    SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
+    if (Match) {
+      if (Match == Cand.SU) {
+        if (Cand.Reason > GenericSchedulerBase::RegCritical)
+          Cand.Reason = GenericSchedulerBase::RegCritical;
+        return true;
+      }
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
     const MachineSchedContext *C)
     : GCNSchedStrategy(C) {
@@ -60,6 +422,12 @@ void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   RegionPolicy.OnlyBottomUp = false;
 
   GCNSchedStrategy::initialize(DAG);
+  Heurs.initialize(DAG, SchedModel, TRI);
+}
+
+void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  Heurs.schedNode(SU);
+  GCNSchedStrategy::schedNode(SU, IsTopNode);
 }
 
 SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
@@ -74,6 +442,7 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
 
   bool PickedPending = false;
   SUnit *SU = nullptr;
+  SchedCandidate *PickedCand = nullptr;
   do {
     PickedPending = false;
     SU = pickOnlyChoice(Top);
@@ -84,10 +453,13 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
                         PickedPending, /*IsBottomUp=*/false);
       assert(TopCand.Reason != NoCand && "failed to find a candidate");
       SU = TopCand.SU;
+      PickedCand = &TopCand;
     }
     IsTopNode = true;
   } while (SU->isScheduled);
 
+  LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
+
   if (PickedPending) {
     unsigned ReadyCycle = SU->TopReadyCycle;
     unsigned CurrentCycle = Top.getCurrCycle();
@@ -141,7 +513,7 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
       initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                     VGPRPressure, IsBottomUp);
       SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-      tryCandidate(Cand, TryCand, ZoneArg);
+      tryCandidateCoexec(Cand, TryCand, ZoneArg);
       if (TryCand.Reason != NoCand) {
         if (TryCand.ResDelta == SchedResourceDelta())
           TryCand.initResourceDelta(Zone.DAG, SchedModel);
@@ -161,9 +533,34 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
   EvaluateQueue(Zone.Pending, /*FromPending=*/true);
 }
 
-bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
-                                             SchedCandidate &TryCand,
-                                             SchedBoundary *Zone) const {
+void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
+                                                SchedCandidate &Cand) {
+  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
+  unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
+
+  dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
+
+  InstructionFlavor Flavor = classifyFlavor(SU->getInstr(), SII);
+  dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
+  SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
+                        /*SkipDebugLoc=*/true);
+  dbgs() << " [" << getFlavorName(Flavor) << "]\n";
+
+  dbgs() << "  Reason: ";
+  if (LastAMDGPUReason != AMDGPUSchedReason::None)
+    dbgs() << getReasonName(LastAMDGPUReason);
+  else if (Cand.Reason != NoCand)
+    dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
+  else
+    dbgs() << "Unknown";
+  dbgs() << "\n\n";
+
+  LastAMDGPUReason = AMDGPUSchedReason::None;
+}
+
+bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
+                                                   SchedCandidate &TryCand,
+                                                   SchedBoundary *Zone) {
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
     TryCand.Reason = FirstValid;
@@ -188,17 +585,21 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
   // "tie-breaking" in nature.
   bool SameBoundary = Zone != nullptr;
   if (SameBoundary) {
-    // For loops that are acyclic path limited, aggressively schedule for
-    // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
-    // heuristics to take precedence.
-    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
-        tryLatency(TryCand, Cand, *Zone))
-      return TryCand.Reason != NoCand;
-
-    // Otherwise compare candidates by the stall they would introduce if
+    // Compare candidates by the stall they would introduce if
     // scheduled in the current cycle.
     if (tryEffectiveStall(Cand, TryCand, *Zone))
       return TryCand.Reason != NoCand;
+
+    Heurs.sortHWUIResources();
+    if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
+      LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
+      return TryCand.Reason != NoCand;
+    }
+
+    if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
+      LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
+      return TryCand.Reason != NoCand;
+    }
   }
 
   // Keep clustered nodes together to encourage downstream peephole
@@ -232,16 +633,6 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
     return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
-    // Avoid critical resource consumption and balance the schedule.
-    TryCand.initResourceDelta(DAG, SchedModel);
-    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
-                TryCand, Cand, ResourceReduce))
-      return TryCand.Reason != NoCand;
-    if (tryGreater(TryCand.ResDelta.DemandedResources,
-                   Cand.ResDelta.DemandedResources, TryCand, Cand,
-                   ResourceDemand))
-      return TryCand.Reason != NoCand;
-
     // Avoid serializing long latency dependence chains.
     // For acyclic path limited loops, latency was already checked above.
     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 4d0eb8611ea8a..86a616968cebf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -19,12 +19,293 @@
 
 namespace llvm {
 
+//===----------------------------------------------------------------------===//
+// Instruction Flavor Classification
+//===----------------------------------------------------------------------===//
+
+enum class InstructionFlavor : uint8_t {
+  WMMA,            // WMMA/MFMA matrix operations
+  SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
+  TRANS,           // Transcendental ops (v_exp, v_log, etc.)
+  MultiCycleVALU,  // VALU instructions with repeat rate > 1
+  VMEM,            // FLAT/GLOBAL memory operations
+  DS,              // LDS/GDS operations
+  SALU,            // Scalar ALU
+  DMA,             // Tensor DMA operations
+  Fence,           // Fences and waits
+  Other,           // Everything else
+  NUM_FLAVORS
+};
+
+inline StringRef getFlavorName(InstructionFlavor F) {
+  switch (F) {
+  case InstructionFlavor::WMMA:
+    return "WMMA";
+  case InstructionFlavor::SingleCycleVALU:
+    return "VALU(1c)";
+  case InstructionFlavor::TRANS:
+    return "TRANS";
+  case InstructionFlavor::MultiCycleVALU:
+    return "VALU(Nc)";
+  case InstructionFlavor::VMEM:
+    return "VMEM";
+  case InstructionFlavor::DS:
+    return "DS";
+  case InstructionFlavor::SALU:
+    return "SALU";
+  case InstructionFlavor::DMA:
+    return "DMA";
+  case InstructionFlavor::Fence:
+    return "Fence";
+  case InstructionFlavor::Other:
+    return "Other";
+  case InstructionFlavor::NUM_FLAVORS:
+    return "???";
+  }
+  llvm_unreachable("Unknown InstructionFlavor");
+}
+
+inline StringRef getFlavorShortName(InstructionFlavor F) {
+  switch (F) {
+  case InstructionFlavor::WMMA:
+    return "W";
+  case InstructionFlavor::SingleCycleVALU:
+    return "V";
+  case InstructionFlavor::TRANS:
+    return "T";
+  case InstructionFlavor::MultiCycleVALU:
+    return "C";
+  case InstructionFlavor::VMEM:
+    return "M";
+  case InstructionFlavor::DS:
+    return "D";
+  case InstructionFlavor::SALU:
+    return "S";
+  case InstructionFlavor::DMA:
+    return "X";
+  case InstructionFlavor::Fence:
+    return "F";
+  case InstructionFlavor::Other:
+    return "O";
+  case InstructionFlavor::NUM_FLAVORS:
+    return "?";
+  }
+  llvm_unreachable("Unknown InstructionFlavor");
+}
+
+InstructionFlavor classifyFlavor(const MachineInstr *MI,
+                                 const SIInstrInfo *SII);
+
+using FlavorGroup = SmallVector<InstructionFlavor, 4>;
+
+namespace FlavorGroups {
+inline FlavorGroup allVALU() {
+  return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
+          InstructionFlavor::MultiCycleVALU};
+}
+inline FlavorGroup allMem() {
+  return {InstructionFlavor::VMEM, InstructionFlavor::DS,
+          InstructionFlavor::DMA};
+}
+inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
+inline FlavorGroup all() {
+  FlavorGroup G;
+  for (unsigned I = 0;
+       I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
+    G.push_back(static_cast<InstructionFlavor>(I));
+  return G;
+}
+} // namespace FlavorGroups
+
+/// AMDGPU-specific scheduling decision reasons. These provide more granularity
+/// than the generic CandReason enum for debugging purposes.
+enum class AMDGPUSchedReason : uint8_t {
+  None,
+  CritResourceBalance, // tryCriticalResource chose based on resource pressure
+  CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
+  NUM_REASONS
+};
+
+inline StringRef getReasonName(AMDGPUSchedReason R) {
+  switch (R) {
+  case AMDGPUSchedReason::None:
+    return "None";
+  case AMDGPUSchedReason::CritResourceBalance:
+    return "CritResource";
+  case AMDGPUSchedReason::CritResourceDep:
+    return "CritResourceDep";
+  case AMDGPUSchedReason::NUM_REASONS:
+    return "???";
+  }
+  llvm_unreachable("Unknown AMDGPUSchedReason");
+}
+
+//===----------------------------------------------------------------------===//
+// Hardware Unit Information
+//===----------------------------------------------------------------------===//
+
+/// HardwareUnitInfo is a wrapper class which maps to some real hardware
+/// resource. This is used to model hardware resource pressure per region, and
+/// guide scheduling heuristics.
+class HardwareUnitInfo {
+private:
+  /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
+  /// for this HardwareUnit. This is used for agreement between
+  /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
+  /// dependencies for a SU on critical resource, then schedule that same SU on
+  /// the critical resource. This agreement results in shorter live ranges and
+  /// more regular HardwareUnit access patterns. SUs are prioritized based on
+  /// depth for top-down scheduling.
+  SmallSetVector<SUnit *, 16> PrioritySUs;
+  /// All the SUs in the region that consume this resource
+  SmallSetVector<SUnit *, 16> AllSUs;
+  /// The total number of busy cycles for this HardwareUnit for a given region.
+  unsigned TotalCycles = 0;
+  // InstructionFlavor mapping
+  InstructionFlavor Type;
+  // Idx mapping
+  unsigned Idx;
+  // Whether or not instructions on this HardwareUnit may produce a window in
+  // which instructions in other HardwareUnits can coexecute. For example, WMMA
+  // / MFMA instructions may take multiple cycles, which may be overlapped with
+  // instructions on other HardwareUnits
+  bool ProducesCoexecWindow = false;
+
+public:
+  HardwareUnitInfo() {}
+
+  unsigned size() { return AllSUs.size(); }
+
+  unsigned getTotalCycles() { return TotalCycles; }
+
+  void setType(unsigned TheType) {
+    assert(TheType < (unsigned)InstructionFlavor::NUM_FLAVORS);
+    Type = (InstructionFlavor)(TheType);
+  }
+
+  InstructionFlavor getType() const { return Type; }
+
+  unsigned getIdx() const { return Idx; }
+
+  bool producesCoexecWindow() const { return ProducesCoexecWindow; }
+
+  void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
+
+  bool contains(SUnit *SU) { return AllSUs.contains(SU); }
+
+  /// \returns true if there is a difference in priority between \p SU and \p
+  /// Other. If so, \returns the SUnit with higher priority. This
+  /// method looks through the PrioritySUs to determine if one SU is more
+  /// prioritized than the other. If neither is in the PrioritySUs list, then
+  /// neither has priority over the other.
+  SUnit *getHigherPriority(SUnit *SU, SUnit *Other) {
+    for (auto *SUOrder : PrioritySUs) {
+      if (SUOrder == SU) {
+        return SU;
+      }
+      if (SUOrder == Other) {
+        return Other;
+      }
+    }
+    return nullptr;
+  }
+
+  void reset() {
+    AllSUs.clear();
+    PrioritySUs.clear();
+    TotalCycles = 0;
+    ProducesCoexecWindow = false;
+  }
+
+  /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
+  /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
+  /// ready) to AllSUs to attempt to find a target SU. When looking through
+  /// AllSUs we pick the target SU by minimal depth for top-down scheduling.
+  /// getNextTargetSU is useful for determining which SU on this HardwareUnit we
+  /// are trying to schedule - this info helps us determine which dependencies
+  /// to schedule. LookDeep is useful if the dependencies are long latency (e.g.
+  /// memory instructions). If we have many long latency dependencies, it is
+  /// beneficial to enable SUs multiple levels ahead.
+  SUnit *getNextTargetSU(bool LookDeep = false);
+  /// insert the \p SU into the AllSUs and account its \p BlockingCycles into
+  /// the TotalCycles. This maintains the list of PrioritySUs.
+  void insert(SUnit *SU, unsigned BlockingCycles);
+  /// schedule the \p SU by removing it from the AllSUs and reducing its \p
+  /// BlockingCycles from the TotalCycles. This maintains the list of
+  /// PrioritySUs.
+  void schedule(SUnit *SU, unsigned BlockingCycles);
+};
+
+//===----------------------------------------------------------------------===//
+// Candidate Heuristics
+//===----------------------------------------------------------------------===//
+
+/// CandidateHeuristics contains state and implementations to facilitate making
+/// per instruction scheduling decisions; it contains methods used in
+/// tryCandidate to decide which instruction to schedule next.
+class CandidateHeuristics {
+protected:
+  ScheduleDAGMI *DAG;
+  const SIInstrInfo *SII;
+  const SIRegisterInfo *SRI;
+  const TargetSchedModel *SchedModel;
+  SmallVector<HardwareUnitInfo, 8> HWUInfo;
+
+  /// Walk over the region and collect total usage per HardwareUnit
+  void collectHWUIPressure();
+
+  /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
+  /// SU
+  unsigned getHWUICyclesForInst(SUnit *SU);
+
+  /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
+  /// mapped HardwareUnit.
+  HardwareUnitInfo *getHWUIFromFlavor(InstructionFlavor Flavor);
+
+public:
+  CandidateHeuristics() = default;
+
+  void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
+                  const TargetRegisterInfo *TRI);
+
+  void schedNode(SUnit *SU);
+
+  /// Sort the HardwareUnitInfo vector. After sorting, the HWUI that are highest
+  /// priority are first. Priority is determined by maximizing coexecution and
+  /// keeping the critical Hardware unit busy.
+  void sortHWUIResources();
+
+  /// Check for critical resource consumption. Prefer the candidate that uses
+  /// the most prioritized HardwareUnit. If both candidates use the same
+  /// HardwareUnit, prefer the candidate with higher priority on that
+  /// HardwareUnit.
+  bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
+                           GenericSchedulerBase::SchedCandidate &Cand,
+                           SchedBoundary *Zone) const;
+
+  /// Check for dependencies of instructions that use prioritized HardwareUnits.
+  /// Prefer the candidate that is a dependency of an instruction that uses the
+  /// most prioritized HardwareUnit. If both candidates enable the same
+  /// HardwareUnit, prefer the candidate that enables the higher priority
+  /// instruction on that HardwareUnit.
+  bool
+  tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
+                                GenericSchedulerBase::SchedCandidate &Cand,
+                                SchedBoundary *Zone) const;
+
+  void dumpRegionSummary();
+};
+
 class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
 protected:
-  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
-                    SchedBoundary *Zone) const override;
   bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
                          SchedBoundary &Zone) const;
+  AMDGPUSchedReason LastAMDGPUReason = AMDGPUSchedReason::None;
+  CandidateHeuristics Heurs;
+
+  void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
+  bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
+                          SchedBoundary *Zone);
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                          const RegPressureTracker &RPTracker,
                          SchedCandidate &Cand, bool &PickedPending,
@@ -35,6 +316,7 @@ class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
 
   void initialize(ScheduleDAGMI *DAG) override;
   SUnit *pickNode(bool &IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
 };
 
 ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 4196b3abec7ab..f568c7607d58d 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -89,19 +89,19 @@ body: |
     ; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
     ;
     ; COEXEC-LABEL: name: test-sched-pending-structural-stall
-    ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC: S_NOP 0
+    ; COEXEC-NEXT: S_NOP 0
+    ; COEXEC-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: S_NOP 0
-    ; COEXEC-NEXT: S_NOP 0
     ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
     %0:vreg_512_align2 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
new file mode 100644
index 0000000000000..56d0844d7f62d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -0,0 +1,601 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-sched-strategy=coexec --verify-misched  < %s | FileCheck -check-prefix=COEXEC %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250  < %s | FileCheck -check-prefix=GCN %s
+
+
+define amdgpu_kernel void @ds_wmma(ptr addrspace(3) %base, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 {
+; COEXEC-LABEL: ds_wmma:
+; COEXEC:       ; %bb.0: ; %entry
+; COEXEC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; COEXEC-NEXT:    v_mov_b32_e32 v0, 0
+; COEXEC-NEXT:    s_clause 0x1
+; COEXEC-NEXT:    s_load_b32 s2, s[4:5], 0x0 nv
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; COEXEC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; COEXEC-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v24, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v26, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v28, v0
+; COEXEC-NEXT:    v_mov_b32_e32 v29, v0
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_bitcmp1_b32 s0, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v30, v0
+; COEXEC-NEXT:    s_cselect_b32 s0, -1, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v31, v0
+; COEXEC-NEXT:    s_xor_b32 s0, s0, -1
+; COEXEC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; COEXEC-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
+; COEXEC-NEXT:    v_cmp_ne_u32_e64 s0, 1, v32
+; COEXEC-NEXT:  .LBB0_1: ; %loop
+; COEXEC-NEXT:    ; =>This Inner Loop Header: Depth=1
+; COEXEC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; COEXEC-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_mov_b32_e32 v80, s2
+; COEXEC-NEXT:    s_add_co_i32 s2, s2, s1
+; COEXEC-NEXT:    ds_load_tr16_b128 v[36:39], v80 offset:192
+; COEXEC-NEXT:    ds_load_tr16_b128 v[40:43], v80
+; COEXEC-NEXT:    ds_load_tr16_b128 v[44:47], v80 offset:64
+; COEXEC-NEXT:    ds_load_tr16_b128 v[32:35], v80 offset:128
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[52:55], v80 offset:448
+; COEXEC-NEXT:    ds_load_tr16_b128 v[56:59], v80 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[60:63], v80 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[48:51], v80 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[68:71], v80 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[72:75], v80 offset:512
+; COEXEC-NEXT:    ds_load_tr16_b128 v[76:79], v80 offset:576
+; COEXEC-NEXT:    s_wait_dscnt 0x3
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[64:67], v80 offset:640
+; COEXEC-NEXT:    ds_load_tr16_b128 v[84:87], v80 offset:960
+; COEXEC-NEXT:    ds_load_tr16_b128 v[88:91], v80 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[92:95], v80 offset:832
+; COEXEC-NEXT:    ds_load_tr16_b128 v[80:83], v80 offset:896
+; COEXEC-NEXT:    s_wait_dscnt 0x4
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
+; COEXEC-NEXT:    s_cbranch_vccnz .LBB0_1
+; COEXEC-NEXT:  ; %bb.2: ; %end
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_mov_b32_e32 v32, 0
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_clause 0x7
+; COEXEC-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; COEXEC-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; COEXEC-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; COEXEC-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; COEXEC-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; COEXEC-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; COEXEC-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; COEXEC-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; COEXEC-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; COEXEC-NEXT:    s_endpgm
+;
+; GCN-LABEL: ds_wmma:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; GCN-NEXT:    s_load_b32 s2, s[4:5], 0x0 nv
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GCN-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; GCN-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; GCN-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; GCN-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; GCN-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; GCN-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; GCN-NEXT:    s_cselect_b32 s0, -1, 0
+; GCN-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; GCN-NEXT:    s_xor_b32 s0, s0, -1
+; GCN-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s0
+; GCN-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; GCN-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0
+; GCN-NEXT:    v_mov_b32_e32 v26, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 1, v24
+; GCN-NEXT:    v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0
+; GCN-NEXT:    v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0
+; GCN-NEXT:    v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0
+; GCN-NEXT:  .LBB0_1: ; %loop
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v92, s2
+; GCN-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT:    s_add_co_i32 s2, s2, s1
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v92
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v92 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v92 offset:128
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v92 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v92 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v92 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v92 offset:384
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v92 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v92 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v92 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v92 offset:640
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v92 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v92 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v92 offset:832
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v92 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v92 offset:960
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_1
+; GCN-NEXT:  ; %bb.2: ; %end
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v32, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_clause 0x7
+; GCN-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; GCN-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; GCN-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; GCN-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; GCN-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; GCN-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; GCN-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; GCN-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+
+  br label %loop
+
+loop:
+  %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ]
+  %wvec0 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma01,  %loop ]
+  %wvec1 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma11,  %loop ]
+  %wvec2 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma21,  %loop ]
+  %wvec3 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma31,  %loop ]
+  %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff
+  %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64
+  %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128
+  %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192
+  %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 256
+  %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320
+  %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384
+  %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448
+  %p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512
+  %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576
+  %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640
+  %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704
+  %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768
+  %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832
+  %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896
+  %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960
+  %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %p0)
+  %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1)
+  %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2)
+  %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3)
+  %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4)
+  %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5)
+  %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6)
+  %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7)
+  %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8)
+  %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9)
+  %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10)
+  %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11)
+  %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12)
+  %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13)
+  %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14)
+  %l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15)
+  %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wvec0, i1 false, i1 false)
+  %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wmma00, i1 false, i1 false)
+  %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wvec1, i1 false, i1 false)
+  %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wmma10, i1 false, i1 false)
+  %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wvec2, i1 false, i1 false)
+  %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wmma20, i1 false, i1 false)
+  %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wvec3, i1 false, i1 false)
+  %wmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wmma30, i1 false, i1 false)
+  %newBaseOff = or disjoint i32 %baseOff, %delta
+  br i1 %br0, label %loop, label %end
+
+end:
+  %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128
+  %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256
+  %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384
+  store <8 x float> %wmma01, ptr addrspace(1) %out, align 16
+  store <8 x float> %wmma11, ptr addrspace(1) %out1, align 16
+  store <8 x float> %wmma21, ptr addrspace(1) %out2, align 16
+  store <8 x float> %wmma31, ptr addrspace(1) %out3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace(3) %base1, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 {
+; COEXEC-LABEL: ds_wmma_permute:
+; COEXEC:       ; %bb.0: ; %entry
+; COEXEC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; COEXEC-NEXT:    s_mov_b32 s6, 0
+; COEXEC-NEXT:    s_clause 0x1
+; COEXEC-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; COEXEC-NEXT:    v_mov_b32_e32 v0, 0
+; COEXEC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; COEXEC-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v24, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v26, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v28, v0
+; COEXEC-NEXT:    v_mov_b32_e32 v29, v0
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_bitcmp1_b32 s0, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v30, v0
+; COEXEC-NEXT:    s_cselect_b32 s0, -1, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v31, v0
+; COEXEC-NEXT:    s_xor_b32 s0, s0, -1
+; COEXEC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; COEXEC-NEXT:    v_cndmask_b32_e64 v32, 0, 1, s0
+; COEXEC-NEXT:    v_cmp_ne_u32_e64 s0, 1, v32
+; COEXEC-NEXT:  .LBB1_1: ; %loop
+; COEXEC-NEXT:    ; =>This Inner Loop Header: Depth=1
+; COEXEC-NEXT:    s_add_co_i32 s7, s2, s6
+; COEXEC-NEXT:    s_add_co_i32 s8, s3, s6
+; COEXEC-NEXT:    s_add_co_i32 s6, s6, s1
+; COEXEC-NEXT:    v_dual_mov_b32 v124, s7 :: v_dual_mov_b32 v156, s8
+; COEXEC-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; COEXEC-NEXT:    ds_load_tr16_b128 v[32:35], v124
+; COEXEC-NEXT:    ds_load_tr16_b128 v[40:43], v156
+; COEXEC-NEXT:    ds_load_tr16_b128 v[44:47], v156 offset:64
+; COEXEC-NEXT:    ds_load_tr16_b128 v[36:39], v124 offset:64
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[52:55], v124 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[60:63], v156 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[48:51], v124 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[56:59], v156 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[68:71], v124 offset:576
+; COEXEC-NEXT:    ds_load_tr16_b128 v[76:79], v156 offset:576
+; COEXEC-NEXT:    ds_load_tr16_b128 v[64:67], v124 offset:512
+; COEXEC-NEXT:    s_wait_dscnt 0x3
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[72:75], v156 offset:512
+; COEXEC-NEXT:    ds_load_tr16_b128 v[84:87], v124 offset:832
+; COEXEC-NEXT:    ds_load_tr16_b128 v[92:95], v156 offset:832
+; COEXEC-NEXT:    ds_load_tr16_b128 v[80:83], v124 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[88:91], v156 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[96:99], v124 offset:128
+; COEXEC-NEXT:    ds_load_tr16_b128 v[100:103], v124 offset:192
+; COEXEC-NEXT:    s_wait_dscnt 0x6
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[104:107], v124 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[108:111], v124 offset:448
+; COEXEC-NEXT:    ds_load_tr16_b128 v[112:115], v124 offset:640
+; COEXEC-NEXT:    ds_load_tr16_b128 v[116:119], v124 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[120:123], v124 offset:896
+; COEXEC-NEXT:    ds_load_tr16_b128 v[124:127], v124 offset:960
+; COEXEC-NEXT:    ds_load_tr16_b128 v[128:131], v156 offset:128
+; COEXEC-NEXT:    s_wait_dscnt 0x9
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[132:135], v156 offset:192
+; COEXEC-NEXT:    ds_load_tr16_b128 v[136:139], v156 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[140:143], v156 offset:448
+; COEXEC-NEXT:    ds_load_tr16_b128 v[144:147], v156 offset:640
+; COEXEC-NEXT:    ds_load_tr16_b128 v[148:151], v156 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[152:155], v156 offset:896
+; COEXEC-NEXT:    ds_load_tr16_b128 v[156:159], v156 offset:960
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT:    s_wait_dscnt 0x6
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT:    s_wait_dscnt 0x4
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT:    s_wait_dscnt 0x2
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT:    s_cbranch_vccnz .LBB1_1
+; COEXEC-NEXT:  ; %bb.2: ; %end
+; COEXEC-NEXT:    v_mov_b32_e32 v32, 0
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_clause 0x7
+; COEXEC-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; COEXEC-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; COEXEC-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; COEXEC-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; COEXEC-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; COEXEC-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; COEXEC-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; COEXEC-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; COEXEC-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; COEXEC-NEXT:    s_endpgm
+;
+; GCN-LABEL: ds_wmma_permute:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; GCN-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GCN-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; GCN-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; GCN-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; GCN-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; GCN-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; GCN-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; GCN-NEXT:    s_cselect_b32 s0, -1, 0
+; GCN-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; GCN-NEXT:    s_xor_b32 s0, s0, -1
+; GCN-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s0
+; GCN-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; GCN-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0
+; GCN-NEXT:    v_mov_b32_e32 v26, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 1, v24
+; GCN-NEXT:    v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0
+; GCN-NEXT:    v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0
+; GCN-NEXT:    v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0
+; GCN-NEXT:  .LBB1_1: ; %loop
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_add_co_i32 s7, s2, s6
+; GCN-NEXT:    s_add_co_i32 s8, s3, s6
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v96, s7 :: v_dual_mov_b32 v97, s8
+; GCN-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT:    s_add_co_i32 s6, s6, s1
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v96
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v96 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v97
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v97 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v96 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v96 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v97 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v97 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v96 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v96 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v97 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v97 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v96 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v96 offset:832
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v97 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v97 offset:832
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v96 offset:128
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v96 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v97 offset:128
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v97 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v96 offset:384
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v96 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v97 offset:384
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v97 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v96 offset:640
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v96 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v97 offset:640
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v97 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v96 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v96 offset:960
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v97 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v97 offset:960
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    s_cbranch_vccnz .LBB1_1
+; GCN-NEXT:  ; %bb.2: ; %end
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v32, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_clause 0x7
+; GCN-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; GCN-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; GCN-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; GCN-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; GCN-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; GCN-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; GCN-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; GCN-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+
+  br label %loop
+
+loop:
+  %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ]
+  %wvec0 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma01,  %loop ]
+  %wvec1 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma11,  %loop ]
+  %wvec2 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma21,  %loop ]
+  %wvec3 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma31,  %loop ]
+  %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff
+  %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64
+  %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128
+  %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192
+  %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 256
+  %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320
+  %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384
+  %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448
+  %p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512
+  %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576
+  %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640
+  %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704
+  %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768
+  %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832
+  %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896
+  %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960
+  %bp0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base1, i32 %baseOff
+  %bp1 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 64
+  %bp2 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 128
+  %bp3 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 192
+  %bp4 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 256
+  %bp5 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 320
+  %bp6 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 384
+  %bp7 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 448
+  %bp8 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 512
+  %bp9 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 576
+  %bp10 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 640
+  %bp11 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 704
+  %bp12 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 768
+  %bp13 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 832
+  %bp14 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 896
+  %bp15 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 960
+
+  %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %p0)
+  %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1)
+  %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2)
+  %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3)
+  %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4)
+  %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5)
+  %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6)
+  %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7)
+  %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8)
+  %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9)
+  %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10)
+  %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11)
+  %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12)
+  %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13)
+  %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14)
+  %l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15)
+  %bl0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %bp0)
+  %bl1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp1)
+  %bl2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp2)
+  %bl3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp3)
+  %bl4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp4)
+  %bl5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp5)
+  %bl6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp6)
+  %bl7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp7)
+  %bl8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp8)
+  %bl9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp9)
+  %bl10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp10)
+  %bl11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp11)
+  %bl12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp12)
+  %bl13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp13)
+  %bl14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp14)
+  %bl15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp15)
+  %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec0 = shufflevector <8 x half> %bl0, <8 x half> %bl1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec1 = shufflevector <8 x half> %bl2, <8 x half> %bl3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec2 = shufflevector <8 x half> %bl4, <8 x half> %bl5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec3 = shufflevector <8 x half> %bl6, <8 x half> %bl7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec4 = shufflevector <8 x half> %bl8, <8 x half> %bl9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec5 = shufflevector <8 x half> %bl10, <8 x half> %bl11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec6 = shufflevector <8 x half> %bl12, <8 x half> %bl13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec7 = shufflevector <8 x half> %bl14, <8 x half> %bl15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wvec0, i1 false, i1 false)
+  %bwmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wmma00, i1 false, i1 false)
+  %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %bwmma00, i1 false, i1 false)
+  %bwmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %wmma01, i1 false, i1 false)
+  %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wvec1, i1 false, i1 false)
+  %bwmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wmma10, i1 false, i1 false)
+  %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %bwmma10, i1 false, i1 false)
+  %bwmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %wmma11, i1 false, i1 false)
+  %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %bvec4, i16 0, <8 x float> %wvec2, i1 false, i1 false)
+  %bwmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %bvec4, i16 0, <8 x float> %wmma20, i1 false, i1 false)
+  %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %bwmma20, i1 false, i1 false)
+  %bwmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %wmma21, i1 false, i1 false)
+  %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wvec3, i1 false, i1 false)
+  %bwmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wmma30, i1 false, i1 false)
+  %wmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %bwmma30, i1 false, i1 false)
+  %bwmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %wmma31, i1 false, i1 false)
+  %newBaseOff = or disjoint i32 %baseOff, %delta
+  br i1 %br0, label %loop, label %end
+
+end:
+  %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128
+  %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256
+  %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384
+  store <8 x float> %bwmma01, ptr addrspace(1) %out, align 16
+  store <8 x float> %bwmma11, ptr addrspace(1) %out1, align 16
+  store <8 x float> %bwmma21, ptr addrspace(1) %out2, align 16
+  store <8 x float> %bwmma31, ptr addrspace(1) %out3, align 16
+  ret void
+}
+
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" "amdgpu-waves-per-eu"="1,1" }

>From b0ecd041f9df4c937a677c4902d1db990cc21392 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 5 Mar 2026 17:57:43 -0800
Subject: [PATCH 2/2] Change old code

Change-Id: I26cff6c0c5743684778f022b264c9930eeff24ce
---
 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index cec06ff514697..59c2536592cb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -54,7 +54,10 @@ InstructionFlavor llvm::classifyFlavor(const MachineInstr *MI,
       Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
     return InstructionFlavor::Fence;
 
-  if ((SII->isFLAT(*MI) || SII->isFLATGlobal(*MI)) && SII->isDS(*MI))
+  if (Opc == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
+      Opc == AMDGPU::TENSOR_LOAD_TO_LDS ||
+      Opc == AMDGPU::GLOBAL_LOAD_ASYNC_TO_LDS_B32 ||
+      Opc == AMDGPU::GLOBAL_LOAD_ASYNC_TO_LDS_B32_SADDR)
     return InstructionFlavor::DMA;
 
   if (SII->isMFMAorWMMA(*MI))
@@ -87,7 +90,6 @@ SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) {
   if (!LookDeep)
     return nullptr;
 
-  // TODO -- we may want to think about more advance strategies here.
   unsigned MinDepth = std::numeric_limits<unsigned int>::max();
   SUnit *TargetSU = nullptr;
   for (auto *SU : AllSUs) {



More information about the llvm-branch-commits mailing list