[llvm] [AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp interleaving (PR #80370)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 7 10:55:08 PST 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/80370

>From eb4594624e505ae7a0d7bca13c4d54e5d5425a0d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 1 Feb 2024 13:57:13 -0800
Subject: [PATCH 1/2] [AMDGPU] Introduce IGLPPhase

Change-Id: I3690e082b98b57392075cac783b853f3fb48b0e5
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     | 28 +++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h       | 10 ++++++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  6 ++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  6 ++--
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4462cd8a31f13e..74b62f22aff216 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -852,7 +852,7 @@ class IGLPStrategy {
   virtual void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) = 0;
+      IGLPPhase Phase) = 0;
 
   // Returns true if this strategy should be applied to a ScheduleDAG.
   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -871,7 +871,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) override;
+      IGLPPhase Phase) override;
 
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
@@ -884,7 +884,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
 void MFMASmallGemmOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-    bool IsReentry) {
+    IGLPPhase Phase) {
   // Count the number of MFMA instructions.
   unsigned MFMACount = 0;
   for (const MachineInstr &I : *DAG)
@@ -1101,7 +1101,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) override;
+      IGLPPhase Phase) override;
 
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
@@ -1118,11 +1118,11 @@ static unsigned DSWWithSharedVMEMCount = 0;
 void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-    bool IsReentry) {
+    IGLPPhase Phase) {
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
 
-  assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
+  assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
          "DSWCounters should be zero in pre-RA scheduling!");
   SmallVector<SUnit *, 6> DSWithPerms;
@@ -1133,7 +1133,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     else if (TII->isDS(*I)) {
       if (I->mayLoad())
         ++DSRCount;
-      else if (I->mayStore() && !IsReentry) {
+      else if (I->mayStore() && Phase == IGLPPhase::Initial) {
         ++DSWCount;
         for (auto Pred : SU.Preds) {
           if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1146,7 +1146,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     }
   }
 
-  if (!IsReentry) {
+  if (Phase == IGLPPhase::Initial) {
     DSWWithPermCount = DSWithPerms.size();
     auto I = DSWithPerms.begin();
     auto E = DSWithPerms.end();
@@ -1414,10 +1414,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   bool IsBottomUp = 1;
 
   // Whether or not this is a reentry into the IGroupLPDAGMutation.
-  bool IsReentry = false;
+  IGLPPhase Phase = IGLPPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
+  IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {}
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -1717,7 +1717,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
   auto S = createIGLPStrategy(StrategyID, DAG, TII);
   if (S->shouldApplyStrategy(DAG)) {
     IsBottomUp = S->IsBottomUp;
-    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
+    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
   }
 }
 
@@ -1725,13 +1725,13 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 
 namespace llvm {
 
-/// \p IsReentry specifes whether or not this is a reentry into the
+/// \p Phase specifies whether or not this is a reentry into the
 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
-  return std::make_unique<IGroupLPDAGMutation>(IsReentry);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase) {
+  return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 3ec8be4f889205..0b72c3dbecce1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -14,7 +14,15 @@
 
 namespace llvm {
 
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry);
+// The phase of instruction scheduling during which the IGLP DAG mutation is
+// being applied (initial pre-RA schedule, pre-RA reentry, or post-RA).
+enum class IGLPPhase {
+  Initial = 0u,
+  PreRAReentry = 1u << 0,
+  PostRA = 1u << 1
+};
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase);
 
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b8a7a5e2080213..cfebf72c4f42a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -471,7 +471,7 @@ static ScheduleDAGInstrs *
 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
   return DAG;
 }
 
@@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     if (ST.shouldClusterStores())
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
-    DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
+    DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4081115aa68cad..e67a296bcada17 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
-  DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry));
 
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
@@ -855,7 +855,7 @@ bool GCNSchedStage::initGCNRegion() {
     SavedMutations.swap(DAG.Mutations);
     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                           StageID == GCNSchedStageID::ILPInitialSchedule;
-    DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage));
+    DAG.addMutation(createIGroupLPDAGMutation(IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
   }
 
   return true;
@@ -1569,7 +1569,7 @@ void GCNPostScheduleDAGMILive::schedule() {
   if (HasIGLPInstrs) {
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
-    addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true));
+    addMutation(createIGroupLPDAGMutation(/*Phase=*/IGLPPhase::PostRA));
   }
 
   ScheduleDAGMI::schedule();

>From c0a909d8da1f804f764349af5ae9a54c33c11c55 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Jan 2024 12:44:12 -0800
Subject: [PATCH 2/2] [AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp
 interleaving

Change-Id: I593cc0ffa7ba1ddcec670028db5fc305fc355e85
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |  894 ++++++-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h       |    6 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |    3 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 2055 +++++++++++++++++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  900 ++++++++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir  |  646 ++++++
 6 files changed, 4478 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 74b62f22aff216..2e8783c12fdddf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -140,8 +140,6 @@ class SchedGroup {
   // Count of the number of created SchedGroups, used to initialize SGID.
   static unsigned NumSchedGroups;
 
-  const SIInstrInfo *TII;
-
   // Try to add and edge from SU A to SU B.
   bool tryAddEdge(SUnit *A, SUnit *B);
 
@@ -154,6 +152,7 @@ class SchedGroup {
   SmallVector<SUnit *, 32> Collection;
 
   ScheduleDAGInstrs *DAG;
+  const SIInstrInfo *TII;
 
   // Returns true if SU can be added to this SchedGroup.
   bool canAddSU(SUnit &SU) const;
@@ -234,13 +233,13 @@ class SchedGroup {
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 };
@@ -442,7 +441,8 @@ void PipelineSolver::convertSyncMapsToArrays() {
 template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
   for (; I != E; ++I) {
     auto &GroupA = *I;
-    for (auto J = std::next(I); J != E; ++J) {
+    auto J = std::next(I);
+    for (; J != E; ++J) {
       auto &GroupB = *J;
       GroupA.link(GroupB);
     }
@@ -488,7 +488,9 @@ int PipelineSolver::linkSUnit(
       continue;
     }
     auto Group = *I;
-    AddedCost += Group.link(*SU, MakePred, AddedEdges);
+    auto Temp = Group.link(*SU, MakePred, AddedEdges);
+
+    AddedCost += Temp;
     assert(AddedCost >= 0);
   }
   return AddedCost;
@@ -633,6 +635,7 @@ bool PipelineSolver::solveExact() {
   assert(static_cast<size_t>(CurrConflInstNo) <
          PipelineInstrs[CurrSyncGroupIdx].size());
   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+
   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
 
@@ -785,6 +788,7 @@ bool PipelineSolver::solveGreedy() {
 
   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+
     IsBottomUp
         ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
         : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
@@ -838,6 +842,7 @@ void PipelineSolver::solve() {
 enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
+  MFMAExpInterleave = 2
 };
 
 // Implement a IGLP scheduling strategy.
@@ -904,6 +909,851 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+  /// Whether or not the instruction is a transitive predecessor of an MFMA
+  /// instruction
+  class IsPipeExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      if (Cache->empty()) {
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++) {
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Cache->push_back(&*I);
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      auto Reaches = (std::any_of(
+          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+          }));
+
+      return Reaches;
+    }
+    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is a transitive predecessor of the same
+  /// MFMA instruction as an instruction in a SchedGroup \p Number steps before
+  class ProduceSameMFMAWithPrevN final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Number) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      if (Cache->empty()) {
+        auto TII = SyncPipe[0].TII;
+        SmallVector<SUnit *, 8> Worklist;
+
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++)
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Worklist.push_back(&*I);
+
+        for (auto BaseSU : OtherGroup->Collection) {
+          if (!Cache->empty())
+            break;
+          for (auto CandSU : Worklist) {
+            if (DAG->IsReachable(CandSU, BaseSU)) {
+              Cache->push_back(CandSU);
+              break;
+            }
+          }
+        }
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    ProduceSameMFMAWithPrevN(unsigned Number, const SIInstrInfo *TII,
+                             unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the instruction has less than \p Size immediate successors
+  class LessThanNSuccs final : public InstructionRule {
+  private:
+    unsigned Size = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (!SyncPipe.size())
+        return false;
+
+      return SU->Succs.size() < Size;
+    }
+    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Size(Size) {}
+  };
+
+  // Whether or not the instruction is an V_CVT instruction.
+  class IsPipelineCvt final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto Opc = SU->getInstr()->getOpcode();
+      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32;
+    }
+    IsPipelineCvt(const SIInstrInfo *TII, unsigned SGID,
+                  bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether or not the instruction is an V_FMA_F32 instruction.
+  class IsFMAF32 final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64;
+    }
+    IsFMAF32(unsigned Val, const SIInstrInfo *TII, unsigned SGID,
+             bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is an immediate RAW successor
+  /// of the SchedGroup \p Distance steps before.
+  class IsSuccOfPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      for (auto &OtherEle : OtherGroup->Collection) {
+        for (auto &Succ : OtherEle->Succs) {
+          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
+            return true;
+        }
+      }
+
+      return false;
+    }
+    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                         unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is a transitive successor of any
+  /// instruction in the SchedGroup \p Distance steps before.
+  class IsReachableFromPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      for (auto &OtherEle : OtherGroup->Collection)
+        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
+          return true;
+
+      return false;
+    }
+    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is the \p Number th occurring DS_READ
+  /// instruction
+  class IsNthDSRead final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      unsigned Counter = 0;
+      if (Cache->empty()) {
+        for (auto &ParseSU : DAG->SUnits) {
+          auto MI = ParseSU.getInstr();
+          if (TII->isDS(MI->getOpcode()) && MI->mayLoad()) {
+            if (Counter == Number) {
+              Cache->push_back(&ParseSU);
+              break;
+            }
+
+            ++Counter;
+          }
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return (*Cache)[0]->NodeNum <= SU->NodeNum;
+    }
+    IsNthDSRead(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+                bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  // Whether or not the instruction is not a transitive predecessor of any
+  // TRANS instruction
+  class IsPipeMFMA final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      SmallVector<SUnit *, 12> Worklist;
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      if (Cache->empty()) {
+        for (auto &SU : DAG->SUnits)
+          if (TII->isTRANS(SU.getInstr()->getOpcode()))
+            Cache->push_back(&SU);
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return !(
+          std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *BaseSU) {
+            return DAG->IsReachable(BaseSU, const_cast<SUnit *>(SU));
+          }));
+    }
+
+    IsPipeMFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether the instruction occurs after the first TRANS instruction. This
+  // implies the instruction cannot be a predecessor of the first TRANS
+  // instruction.
+  class OccursAfterExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      SmallVector<SUnit *, 12> Worklist;
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      if (Cache->empty()) {
+        for (auto &SU : DAG->SUnits)
+          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
+            Cache->push_back(&SU);
+            break;
+          }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return SU->NodeNum > (*Cache)[0]->NodeNum;
+    }
+
+    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether the SU is not a successor of any element in the previous
+  // SchedGroup
+  class IsNotSuccOfPrevGroup final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      // Does the previous VALU have this DS_Write as a successor
+      return !(std::any_of(OtherGroup->Collection.begin(),
+                           OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
+                             return std::any_of(Elt->Succs.begin(),
+                                                Elt->Succs.end(),
+                                                [&SU](SDep &Succ) {
+                                                  return Succ.getSUnit() == SU;
+                                                });
+                           }));
+    }
+    IsNotSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
+                         bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+public:
+  void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      IGLPPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = 0;
+  }
+};
+
+static unsigned TransPipeCount = 0;
+static unsigned MFMAPipeCount = 0;
+static unsigned MFMAEnablement = 0;
+static unsigned ExpRequirement = 0;
+
+void MFMAExpInterleaveOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    IGLPPhase Phase) {
+
+  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  if (Phase == IGLPPhase::Initial) {
+    SmallVector<SUnit *, 10> ExpPipeCands;
+    SmallVector<SUnit *, 10> MFMAPipeCands;
+    SmallVector<SUnit *, 10> MFMAPipeSUs;
+    SmallVector<SUnit *, 10> PackSUs;
+
+    auto isBitPack = [](unsigned Opc) {
+      return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
+    };
+    for (SUnit &SU : DAG->SUnits) {
+      auto Opc = SU.getInstr()->getOpcode();
+      if (TII->isTRANS(Opc)) {
+        // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+        if (SU.Succs.size() >= 7)
+          continue;
+        ExpPipeCands.push_back(&SU);
+      }
+
+      if (TII->isMFMAorWMMA(*SU.getInstr()))
+        MFMAPipeCands.push_back(&SU);
+
+      if (isBitPack(Opc))
+        PackSUs.push_back(&SU);
+    }
+
+    if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
+      return;
+
+    TransPipeCount = 0;
+    MFMAPipeCount = 0;
+    MFMAEnablement = 0;
+    ExpRequirement = 0;
+
+    std::optional<SUnit *> TempMFMA;
+    std::optional<SUnit *> TempExp;
+    // Count the number of EXPs that reach an MFMA
+    for (auto &PredSU : ExpPipeCands) {
+      for (auto &SuccSU : MFMAPipeCands) {
+        if (DAG->IsReachable(SuccSU, PredSU)) {
+          if (!TempExp) {
+            TempExp = PredSU;
+            TempMFMA = SuccSU;
+          }
+          MFMAPipeSUs.push_back(SuccSU);
+          ++TransPipeCount;
+          break;
+        }
+      }
+    }
+
+    if (!TempExp)
+      return;
+
+    // Count the number of MFMAs that are reached by an EXP
+    for (auto &SuccSU : MFMAPipeCands) {
+      if (std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
+                       [&SuccSU](SUnit *PotentialMatch) {
+                         return PotentialMatch == SuccSU;
+                       }) != MFMAPipeSUs.end()) {
+        ++MFMAPipeCount;
+        continue;
+      }
+      for (auto &PredSU : ExpPipeCands) {
+        if (DAG->IsReachable(SuccSU, PredSU)) {
+          ++MFMAPipeCount;
+          break;
+        }
+      }
+    }
+
+    if (!TempMFMA || !TempExp)
+      return;
+
+    // The number of bit pack operations that depend on a single V_EXP
+    unsigned PackSuccCount = std::count_if(
+        PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
+          return DAG->IsReachable(VPack, *TempExp);
+        });
+
+    // The number of bit pack operations an MFMA depends on
+    unsigned PackPredCount =
+        std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                      [&isBitPack](SDep &Pred) {
+                        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                        return isBitPack(Opc);
+                      });
+
+    auto PackPred =
+        std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                     [&isBitPack](SDep &Pred) {
+                       auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                       return isBitPack(Opc);
+                     });
+
+    if (PackPred == (*TempMFMA)->Preds.end())
+      return;
+
+    // How many MFMAs depend on a single bit pack operation
+    MFMAEnablement =
+        std::count_if(PackPred->getSUnit()->Succs.begin(),
+                      PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
+                        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+                      });
+
+    // The number of MFMAs that depend on a single V_EXP
+    MFMAEnablement *= PackSuccCount;
+
+    // The number of V_EXPs required to resolve all dependencies for an MFMA
+    ExpRequirement =
+        std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
+                      [this, &PackPred](SUnit *ExpBase) {
+                        return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
+                      });
+
+    ExpRequirement *= PackPredCount;
+  }
+
+  bool IsSmallKernelType =
+      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
+  bool IsLargeKernelType =
+      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
+  bool IsTinyKernelType =
+      MFMAEnablement == 1 && ExpRequirement == 4 && TransPipeCount == 32;
+
+  if (!(IsSmallKernelType || IsLargeKernelType || IsTinyKernelType))
+    return;
+
+  unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+
+  if (IsSmallKernelType && Phase != IGLPPhase::PostRA) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 4, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < 4; I++) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+      SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(1 + 2 * I, TII,
+                                                         SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    for (unsigned I = 0; I < (TransPipeCount - 8) / 2; ++I) {
+
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+      for (unsigned J = 0; J < 2; J++) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(I == 0 ? 8 : 9, TII,
+                                                           SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+    }
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsNotSuccOfPrevGroup>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsNotSuccOfPrevGroup>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  if (IsTinyKernelType) {
+    bool IsPostRA = Phase == IGLPPhase::PostRA;
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<ProduceSameMFMAWithPrevN>(
+        1, TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<ProduceSameMFMAWithPrevN>(
+        2, TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<ProduceSameMFMAWithPrevN>(
+        3, TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < 4; I++) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+      if (!IsPostRA)
+        SG->addRule(
+            std::make_shared<IsSuccOfPrevNthGroup>(4 + I, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      if (I != 0)
+        SG->addRule(std::make_shared<ProduceSameMFMAWithPrevN>(
+            2 * I, TII, SG->getSGID(), true));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < 6; ++I) {
+
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+      if (IsPostRA)
+        SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+      else
+        SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+      SG->addRule(
+          std::make_shared<IsReachableFromPrevNthGroup>(8, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+      for (unsigned J = 0; J < 4; J++) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+        if (!IsPostRA)
+          SG->addRule(
+              std::make_shared<IsSuccOfPrevNthGroup>(8, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+        if ((J + 4 * I) % 4)
+          SG->addRule(std::make_shared<ProduceSameMFMAWithPrevN>(
+              2 * ((J + 4 * I) % 4), TII, SG->getSGID(), true));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+    }
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(
+        std::make_shared<IsReachableFromPrevNthGroup>(8, TII, SG->getSGID()));
+    if (IsPostRA)
+      SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    else
+      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsNotSuccOfPrevGroup>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 2, PipelineSyncID, DAG, TII);
+    if (IsPostRA)
+      SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    else
+      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+    SG->addRule(
+        std::make_shared<IsReachableFromPrevNthGroup>(1, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  if (IsLargeKernelType && Phase != IGLPPhase::PostRA) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 2, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < 3; I++) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+      SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+          I == 0 ? 1 : 4, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsNthDSRead>(32, TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < 2; I++) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+      SG->addRule(
+          std::make_shared<IsReachableFromPrevNthGroup>(5, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    for (unsigned I = 0; I < (TransPipeCount - 8) / 4; I++) {
+      for (unsigned J = 0; J < 2; J++) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+            I == 0 ? 16 + J : 20, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+            I == 0 ? 5 + J : 6, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+        SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsNthDSRead>(32, TII, SG->getSGID(), true));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+      for (unsigned J = 0; J < 2; J++) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+            I == 0 ? 19 : 21 - J, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+            7, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+        SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+    }
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(15 + 1 * 5, TII,
+                                                              SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->addRule(
+        std::make_shared<IsReachableFromPrevNthGroup>(7, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsFMAF32>(1, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(15 + 1 * 5, TII,
+                                                              SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+        6, TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(14 + 1 * 4, TII,
+                                                              SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipelineCvt>(TII, SG->getSGID()));
+    SG->addRule(
+        std::make_shared<IsReachableFromPrevNthGroup>(4, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 5, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeMFMA>(TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  if (IsLargeKernelType && (Phase == IGLPPhase::PostRA)) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 6, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    for (unsigned I = 0; I < (TransPipeCount - 7); I++) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<LessThanNSuccs>(10, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 6, PipelineSyncID, DAG, TII);
+    SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+}
+
 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 private:
   // Whether the DS_READ is a predecessor of first four MFMA in region
@@ -1122,9 +1972,10 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
 
-  assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
-                        DSWWithSharedVMEMCount == 0)) &&
-         "DSWCounters should be zero in pre-RA scheduling!");
+  assert(
+      (Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+                                       DSWWithSharedVMEMCount == 0)) &&
+      "DSWCounters should be zero in pre-RA scheduling!");
   SmallVector<SUnit *, 6> DSWithPerms;
   for (auto &SU : DAG->SUnits) {
     auto I = SU.getInstr();
@@ -1254,14 +2105,14 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         1, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1272,7 +2123,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         3, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1290,7 +2141,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1311,7 +2162,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1325,7 +2176,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1336,7 +2187,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         2, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1347,7 +2198,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         4, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1364,6 +2215,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
   case MFMASmallGemmSingleWaveOptID:
     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+  case MFMAExpInterleave:
+    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -1591,9 +2444,9 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
     auto &SU = *I;
     if (isFull())
       break;
-
-    if (canAddSU(SU))
+    if (canAddSU(SU)) {
       SyncedInstrs[&SU].push_back(SGID);
+    }
   }
 }
 
@@ -1730,7 +2583,8 @@ namespace llvm {
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase) {
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(IGLPPhase Phase) {
   return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 0b72c3dbecce1c..dec723fbf1fb83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -16,11 +16,7 @@ namespace llvm {
 
 // Components of the mask that determines which instruction types may be may be
 // classified into a SchedGroup.
-enum class IGLPPhase {
-  Initial = 0u,
-  PreRAReentry = 1u << 0,
-  PostRA = 1u << 1
-};
+enum class IGLPPhase { Initial = 0u, PreRAReentry = 1u << 0, PostRA = 1u << 1 };
 
 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase);
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e67a296bcada17..d8e28c9ec9409c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -855,7 +855,8 @@ bool GCNSchedStage::initGCNRegion() {
     SavedMutations.swap(DAG.Mutations);
     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                           StageID == GCNSchedStageID::ILPInitialSchedule;
-    DAG.addMutation(createIGroupLPDAGMutation(IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
+    DAG.addMutation(createIGroupLPDAGMutation(
+        IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
   }
 
   return true;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
new file mode 100644
index 00000000000000..5ae2eb43180fad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -0,0 +1,2055 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @largeInterleave() #0 { ret void }
+  ; GCN-LABEL: largeInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr94
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $vgpr106
+  ; GCN-NEXT:    ; implicit-def: $vgpr128
+  ; GCN-NEXT:    ; implicit-def: $vgpr129
+  ; GCN-NEXT:    ; implicit-def: $vgpr135
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+  ; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_lshl_add_u32 v0, s7, 4, v2
+  ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v92, v0, v1, 1
+  ; GCN-NEXT:    v_add_u32_e32 v93, s0, v92
+  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    s_lshl_b32 s0, s7, 7
+  ; GCN-NEXT:    v_add_lshl_u32 v95, v8, s0, 1
+  ; GCN-NEXT:    v_add_u32_e32 v8, 64, v93
+  ; GCN-NEXT:    ; kill: killed $vgpr8
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:    ; kill: killed $vgpr92
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[0:3]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[4:7] offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0x80, v93
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    ; implicit-def: $vgpr73
+  ; GCN-NEXT:    v_add_u32_e32 v76, v128, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    v_add_u32_e32 v72, v128, v73
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr74
+  ; GCN-NEXT:    v_add_u32_e32 v72, v128, v74
+  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v128, v75
+  ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[90:93], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[94:97], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
+  ; GCN-NEXT:    v_perm_b32 v94, v102, v98, s5
+  ; GCN-NEXT:    v_perm_b32 v98, v102, v98, s8
+  ; GCN-NEXT:    v_perm_b32 v102, v103, v99, s5
+  ; GCN-NEXT:    v_perm_b32 v95, v104, v100, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
+  ; GCN-NEXT:    v_perm_b32 v96, v103, v99, s8
+  ; GCN-NEXT:    v_perm_b32 v99, v104, v100, s8
+  ; GCN-NEXT:    v_perm_b32 v103, v105, v101, s5
+  ; GCN-NEXT:    v_perm_b32 v97, v105, v101, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
+  ; GCN-NEXT:    s_nop 5
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v101, s4, v49
+  ; GCN-NEXT:    v_max3_f32 v92, v100, s6, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v50
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v51
+  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
+  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v52
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v53
+  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v54
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v55
+  ; GCN-NEXT:    v_max3_f32 v84, v92, v84, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v56
+  ; GCN-NEXT:    v_mul_f32_e32 v92, s4, v57
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v92
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v58
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v59
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v60
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v61
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v62
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v63
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
+  ; GCN-NEXT:    s_nop 6
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v33
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v34
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v35
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v37
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v39
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v41
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
+  ; GCN-NEXT:    v_max3_f32 v80, v84, v85, v80
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v42
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v43
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v45
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v46
+  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v47
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v17
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
+  ; GCN-NEXT:    v_max3_f32 v68, v80, v68, v69
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v76, s4, v21
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v76
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v23
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v25
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v27
+  ; GCN-NEXT:    v_max3_f32 v64, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v28
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v29
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v31
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v1
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v3
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v5
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v7
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v9
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v11
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v13
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    ; implicit-def: $vgpr65
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $vgpr71
+  ; GCN-NEXT:    ; implicit-def: $vgpr69
+  ; GCN-NEXT:    ; implicit-def: $vgpr70
+  ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
+  ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
+  ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v131, v66, v65, 1
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v129, v64
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_lshl_add_u32 v132, v66, 1, v131
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_lshl_add_u32 v133, v66, 1, v132
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
+  ; GCN-NEXT:    v_lshl_add_u32 v134, v66, 1, v133
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v131, v[94:95]
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v129, v64
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v132, v[98:99]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v133, v[102:103]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v134, v[96:97]
+  ; GCN-NEXT:    v_add_u32_e32 v71, v128, v71
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    ; implicit-def: $vgpr65
+  ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v130, v66, v64
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v130
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v49, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v52, -v130
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v53, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v53, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v54, -v130
+  ; GCN-NEXT:    ; implicit-def: $vgpr54
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[152:153], v71, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v54, v128, v54
+  ; GCN-NEXT:    buffer_load_dwordx2 v[154:155], v54, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v54, v128, v69
+  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v54, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v54, v128, v70
+  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v54, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v51, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_exp_f32_e32 v52, v48
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v48, v51
+  ; GCN-NEXT:    ds_read_b128 v[110:113], v135
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v52
+  ; GCN-NEXT:    ds_read_b128 v[114:117], v135 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[118:121], v135 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v48, v54
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v68
+  ; GCN-NEXT:    v_exp_f32_e32 v54, v48
+  ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN-NEXT:    v_fma_f32 v80, s4, v55, -v130
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v64, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v55, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v56, -v130
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_mul_f32_e32 v56, 0x3fb8aa3b, v80
+  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v96, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    v_exp_f32_e32 v53, v53
+  ; GCN-NEXT:    v_exp_f32_e32 v55, v55
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[110:111], v[148:149], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v56, v56
+  ; GCN-NEXT:    ds_read_b128 v[136:139], v135 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v122, v53
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v54
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v161, s4, v61, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[114:115], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v115, s4, v58, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v58, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v114, v55
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v150, v122, v123
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v62, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[118:119], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v118, v56
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v160, v58
+  ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v44, 0x3fb8aa3b, v44
+  ; GCN-NEXT:    v_pack_b32_f16 v151, v114, v118
+  ; GCN-NEXT:    v_fma_f32 v114, s4, v59, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v115
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[112:113], v[150:151], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v112, s4, v60, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v114
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v162, v59
+  ; GCN-NEXT:    v_fma_f32 v45, s4, v45, -v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v163, v60
+  ; GCN-NEXT:    v_fma_f32 v46, s4, v46, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[116:117], v[150:151], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v112
+  ; GCN-NEXT:    v_pack_b32_f16 v160, v160, v162
+  ; GCN-NEXT:    v_fma_f32 v47, s4, v47, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v45, 0x3fb8aa3b, v45
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[120:121], v[150:151], v[96:111]
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN-NEXT:    v_exp_f32_e32 v62, v57
+  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    ; implicit-def: $vgpr57
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v57
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[136:137], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v61
+  ; GCN-NEXT:    v_fma_f32 v137, s4, v63, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v63, 0x3fb8aa3b, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v63
+  ; GCN-NEXT:    v_pack_b32_f16 v161, v163, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_fma_f32 v163, s4, v34, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[138:139], v[150:151], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v138, s4, v32, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v32, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v162, v62
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v36, -v130
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v130
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v141, s4, v33, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v33, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v138
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v63
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v130
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v130
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v34, v136
+  ; GCN-NEXT:    ds_read_b128 v[136:139], v57 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v144, v32
+  ; GCN-NEXT:    ds_read_b128 v[148:151], v57 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[136:137], v[160:161], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v33
+  ; GCN-NEXT:    v_perm_b32 v137, v158, v156, s5
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v130
+  ; GCN-NEXT:    v_fma_f32 v30, s4, v30, -v130
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v144, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v162, v140
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v35, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v35, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v163
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v36, v136
+  ; GCN-NEXT:    v_perm_b32 v136, v154, v152, s5
+  ; GCN-NEXT:    v_perm_b32 v140, v154, v152, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v154, 0x3fb8aa3b, v162
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v131, v[136:137]
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_perm_b32 v142, v155, v153, s5
+  ; GCN-NEXT:    v_perm_b32 v152, v155, v153, s8
+  ; GCN-NEXT:    v_perm_b32 v141, v158, v156, s8
+  ; GCN-NEXT:    v_perm_b32 v143, v159, v157, s5
+  ; GCN-NEXT:    v_perm_b32 v153, v159, v157, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v147, s4, v37, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v37, v154
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v38, -v130
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v132, v[140:141]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v133, v[142:143]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v134, v[152:153]
+  ; GCN-NEXT:    ; implicit-def: $vgpr137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v163, v34
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[138:139], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v38, v136
+  ; GCN-NEXT:    ; implicit-def: $vgpr136
+  ; GCN-NEXT:    ; implicit-def: $vgpr139
+  ; GCN-NEXT:    ; implicit-def: $vgpr138
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v35
+  ; GCN-NEXT:    v_add_u32_e32 v139, v128, v139
+  ; GCN-NEXT:    v_add_u32_e32 v136, v128, v136
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[152:153], v139, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[154:155], v136, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v136, v128, v137
+  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v136, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v136, v128, v138
+  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v136, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[136:139], v135
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[148:149], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v162, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v148, v37
+  ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v147
+  ; GCN-NEXT:    v_fma_f32 v149, s4, v39, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v39, v147
+  ; GCN-NEXT:    v_pack_b32_f16 v161, v162, v148
+  ; GCN-NEXT:    v_pack_b32_f16 v160, v163, v146
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[150:151], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v135 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v145, s4, v40, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v40, v144
+  ; GCN-NEXT:    v_fma_f32 v163, s4, v41, -v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v162, v38
+  ; GCN-NEXT:    v_fma_f32 v31, s4, v31, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[136:137], v[160:161], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v149
+  ; GCN-NEXT:    v_exp_f32_e32 v41, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v145
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v135 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[148:151], v135 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v137, v39
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[160:161], v[80:95]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v40
+  ; GCN-NEXT:    v_fma_f32 v141, s4, v42, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v42, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v41
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v130
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[144:145], v[160:161], v[96:111]
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v140, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v163
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v162, v137
+  ; GCN-NEXT:    v_fma_f32 v137, s4, v43, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v43, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v162, v42
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[138:139], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v164, v43
+  ; GCN-NEXT:    v_fma_f32 v4, s4, v4, -v130
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v5, -v130
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v5, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v136
+  ; GCN-NEXT:    ds_read_b128 v[136:139], v57
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v130
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v130
+  ; GCN-NEXT:    v_fma_f32 v14, s4, v14, -v130
+  ; GCN-NEXT:    v_fma_f32 v15, s4, v15, -v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[146:147], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v166, v44
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v163
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v165
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[148:149], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v44, 0x3fb8aa3b, v46
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v45
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v162, v164
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[150:151], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v44, 0x3fb8aa3b, v47
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v166
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[136:137], v[148:149], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v162, v44
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v137, v161
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v160
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_pack_b32_f16 v136, v150, v136
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v164, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v162
+  ; GCN-NEXT:    v_fma_f32 v140, s4, v19, -v130
+  ; GCN-NEXT:    v_perm_b32 v19, v158, v156, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v140
+  ; GCN-NEXT:    v_pack_b32_f16 v137, v137, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[44:45], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_perm_b32 v17, v158, v156, s5
+  ; GCN-NEXT:    v_perm_b32 v18, v154, v152, s8
+  ; GCN-NEXT:    v_perm_b32 v44, v155, v153, s5
+  ; GCN-NEXT:    v_perm_b32 v45, v159, v157, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v168, v164
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[138:139], v[136:137], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v169, v16
+  ; GCN-NEXT:    v_perm_b32 v16, v154, v152, s5
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v131, v[16:17]
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_perm_b32 v138, v155, v153, s8
+  ; GCN-NEXT:    v_perm_b32 v139, v159, v157, s8
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v132, v[18:19]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[136:137], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v140
+  ; GCN-NEXT:    ; implicit-def: $vgpr17
+  ; GCN-NEXT:    ; implicit-def: $vgpr18
+  ; GCN-NEXT:    ; implicit-def: $vgpr19
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v133, v[44:45]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v134, v[138:139]
+  ; GCN-NEXT:    v_add_u32_e32 v19, v128, v19
+  ; GCN-NEXT:    v_add_u32_e32 v17, v128, v17
+  ; GCN-NEXT:    v_add_u32_e32 v18, v128, v18
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v19, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[46:47], v[136:137], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v16
+  ; GCN-NEXT:    ; implicit-def: $vgpr16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v167
+  ; GCN-NEXT:    v_add_u32_e32 v16, v128, v16
+  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v16, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[150:151], v17, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[152:153], v18, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v135
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v169
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v155
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v168, v154
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v135 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v21
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v20, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[136:137], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v24, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v156
+  ; GCN-NEXT:    v_fma_f32 v147, s4, v25, -v130
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v146
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v149, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v135 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[136:139], v135 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v148
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v149
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v24, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v147
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v128, v17
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v27, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v154
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[24:25], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v28, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v147, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v29, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v28, 0x3fb8aa3b, v28
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v147
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[24:25], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v46, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v46
+  ; GCN-NEXT:    v_pack_b32_f16 v45, v29, v44
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[24:25], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v47, v16
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v57
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v57 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pack_b32_f16 v44, v26, v27
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[136:137], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v136, v28
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v137, v47
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[138:139], v[24:25], v[112:127]
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v30
+  ; GCN-NEXT:    v_exp_f32_e32 v138, v24
+  ; GCN-NEXT:    ds_read_b128 v[24:27], v57 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[44:45], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v31
+  ; GCN-NEXT:    v_exp_f32_e32 v139, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v138
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v136
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[44:45], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v144, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v139
+  ; GCN-NEXT:    v_pack_b32_f16 v20, v137, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v153, v151, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v145, v144
+  ; GCN-NEXT:    v_pack_b32_f16 v21, v16, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[24:25], v[44:45], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v137, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v130
+  ; GCN-NEXT:    v_perm_b32 v2, v142, v140, s8
+  ; GCN-NEXT:    v_perm_b32 v16, v143, v141, s5
+  ; GCN-NEXT:    v_perm_b32 v1, v152, v150, s5
+  ; GCN-NEXT:    v_perm_b32 v3, v152, v150, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[20:21], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v0
+  ; GCN-NEXT:    v_perm_b32 v0, v142, v140, s5
+  ; GCN-NEXT:    v_perm_b32 v18, v143, v141, s8
+  ; GCN-NEXT:    v_perm_b32 v19, v153, v151, s8
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v131, v[0:1]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v132, v[2:3]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v133, v[16:17]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v134, v[18:19]
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v4
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[20:21], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v140, v24
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v157
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v140
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v145, v22
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v4, v23
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[26:27], v[20:21], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v26, v0
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v135
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v6
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v135 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[28:29], v[44:45], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v5
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v8, -v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v26
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[30:31], v[20:21], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v29, v4
+  ; GCN-NEXT:    v_fma_f32 v30, s4, v9, -v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v29
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[24:25], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_exp_f32_e32 v31, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v135 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v135 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v27
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[16:17], v[24:25], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v31
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v8, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v30
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[24:25], v[96:111]
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v28, v1
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v11, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v17
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[8:9], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v12, -v130
+  ; GCN-NEXT:    v_exp_f32_e32 v28, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v13, -v130
+  ; GCN-NEXT:    v_mul_f32_e32 v12, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v28
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[18:19], v[8:9], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[8:9], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v19, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v57
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v57 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[20:21], v[24:25], v[112:127]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v21, v12
+  ; GCN-NEXT:    v_pack_b32_f16 v12, v10, v11
+  ; GCN-NEXT:    v_pack_b32_f16 v13, v13, v20
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[22:23], v[8:9], v[112:127]
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v14
+  ; GCN-NEXT:    v_exp_f32_e32 v20, v8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v19
+  ; GCN-NEXT:    ds_read_b128 v[8:11], v57 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[12:13], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v15
+  ; GCN-NEXT:    v_exp_f32_e32 v22, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v22
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[4:5], v[12:13], v[80:95]
+  ; GCN-NEXT:    v_add_f32_e32 v4, 0, v49
+  ; GCN-NEXT:    v_add_f32_e32 v4, v50, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v51, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v52, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v53, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v54, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v55, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v56, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v58, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v59, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v60, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v61, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v62, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v63, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v32, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v33, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v34, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v35, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v36, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v37, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v38, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v39, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v40, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v41, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v42, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v43, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v163, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v165, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v166, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v160, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v161, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v162, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v164, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v167, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v169, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v155, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v156, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v148, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v146, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v149, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v154, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v128, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v147, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v46, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v47, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v136, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v138, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v139, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v144, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v137, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v157, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v140, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v26, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v27, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v29, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v31, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v16, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v17, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v28, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v18, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v14, v15
+  ; GCN-NEXT:    v_add_f32_e32 v4, v19, v4
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[8:9], v[12:13], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v21, v4
+  ; GCN-NEXT:    v_add_f32_e32 v2, v20, v2
+  ; GCN-NEXT:    v_add_f32_e32 v2, v22, v2
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v129, v2
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v129, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[6:7], v[0:1], v[80:95]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+  !0 = !{i64 2862105}
+
+...
+
+---
+name:            largeInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+    %11:vgpr_32 = IMPLICIT_DEF
+    %1:sgpr_512 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %443:sgpr_128 = IMPLICIT_DEF
+    %18:sreg_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %391:vreg_128_align2 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %392:vreg_128_align2 = IMPLICIT_DEF
+    %401:vreg_128_align2 = IMPLICIT_DEF
+    %406:vreg_128_align2 = IMPLICIT_DEF
+    %48:vgpr_32 = IMPLICIT_DEF
+    %473:sgpr_128 = IMPLICIT_DEF
+    %411:vreg_128_align2 = IMPLICIT_DEF
+    %416:vreg_128_align2 = IMPLICIT_DEF
+    %421:vreg_128_align2 = IMPLICIT_DEF
+    %426:vreg_128_align2 = IMPLICIT_DEF
+    %1114:sgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %484:sreg_64_xexec = IMPLICIT_DEF
+    %3346:vgpr_32 = IMPLICIT_DEF
+    %1422:sreg_32 = IMPLICIT_DEF
+    %1424:sreg_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %494:sreg_32 = IMPLICIT_DEF
+    %47:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %43:vgpr_32 = IMPLICIT_DEF
+    %44:vgpr_32 = IMPLICIT_DEF
+    %45:vgpr_32 = IMPLICIT_DEF
+    %50:sreg_32 = IMPLICIT_DEF
+    %3347:vgpr_32 = IMPLICIT_DEF
+    %3329:vgpr_32 = IMPLICIT_DEF
+    %3330:vgpr_32 = IMPLICIT_DEF
+    %3331:vgpr_32 = IMPLICIT_DEF
+    %3332:vgpr_32 = IMPLICIT_DEF
+    %3333:vgpr_32 = IMPLICIT_DEF
+    %2986:vreg_512_align2 = IMPLICIT_DEF
+    %3038:vreg_512_align2 = IMPLICIT_DEF
+    %2980:vreg_512_align2 = IMPLICIT_DEF
+    %3003:vreg_512_align2 = IMPLICIT_DEF
+    %3334:vgpr_32 = IMPLICIT_DEF
+    %3335:vgpr_32 = IMPLICIT_DEF
+    %3336:vgpr_32 = IMPLICIT_DEF
+    %3337:vgpr_32 = IMPLICIT_DEF
+    %3338:vgpr_32 = IMPLICIT_DEF
+    %3339:vgpr_32 = IMPLICIT_DEF
+    %3345:vgpr_32 = IMPLICIT_DEF
+    %3340:vgpr_32 = IMPLICIT_DEF
+    %3341:vgpr_32 = IMPLICIT_DEF
+    %3342:vgpr_32 = IMPLICIT_DEF
+    %3343:vgpr_32 = IMPLICIT_DEF
+    %3344:vgpr_32 = IMPLICIT_DEF
+    %84:vgpr_32 = COPY %3347
+    %86:vgpr_32 = COPY %3347:vgpr_32
+    IGLP_OPT 2
+    %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec
+    %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec
+    %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec
+    %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec
+    %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec
+    %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc
+    %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec
+    %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec
+    %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec
+    %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    early-clobber %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec
+    %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec
+    %708:vgpr_32 = V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec
+    %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec
+    %798:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec
+    %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec
+    %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec
+    %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec
+    %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec
+    %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec
+    %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %821:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1057:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1069:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1088:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, %1061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1133:vgpr_32 = V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec
+    %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec
+    %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, implicit $mode, implicit $exec
+    %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec
+    %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec
+    %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec
+    %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec
+    %151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec
+    %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec
+    %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec
+    %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec
+    %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec
+    %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec
+    %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec
+    %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec
+    %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec
+    %1168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec
+    %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec
+    %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec
+    %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec
+    %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec
+    %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec
+    %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec
+    %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec
+    %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec
+    %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec
+    %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit $mode, implicit $exec
+    %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec
+    %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec
+    %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, implicit $mode, implicit $exec
+    %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec
+    %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec
+    %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec
+    %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec
+    %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec
+    %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec
+    %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec
+    %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec
+    %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec
+    %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec
+    %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec
+    %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec
+    %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec
+    %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec
+    %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec
+    %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec
+    %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec
+    %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec
+    %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec
+    %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, implicit $mode, implicit $exec
+    %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec
+    %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec
+    %1222:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec
+    %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec
+    %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec
+    %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec
+    %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec
+    %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec
+    %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec
+    %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec
+    %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1235:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec
+    %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec
+    %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit $exec
+    %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec
+    %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec
+    %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec
+    %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec
+    %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec
+    %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec
+    %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec
+    %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec
+    %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec
+    %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec
+    %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec
+    %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec
+    %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec
+    %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec
+    %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec
+    %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec
+    %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec
+    %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec
+    %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec
+    %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec
+    %1269:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec
+    %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec
+    %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, implicit $exec
+    %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec
+    %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec
+    %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec
+    %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec
+    %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec
+    %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec
+    %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec
+    %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec
+    %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec
+    %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec
+    %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1288:vgpr_32, implicit $mode, implicit $exec
+    %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec
+    %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec
+    %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec
+    %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec
+    %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec
+    %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec
+    %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec
+    %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec
+    %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec
+    %1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec
+    %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec
+    %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec
+    %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec
+    %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec
+    %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec
+    %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec
+    %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec
+    %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec
+    %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec
+    %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1318:vgpr_32, implicit $mode, implicit $exec
+    %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec
+    %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec
+    %1323:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec
+    %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec
+    %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec
+    %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec
+    %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec
+    %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec
+    %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec
+    %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec
+    %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec
+    %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec
+    %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec
+    %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec
+    %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec
+    %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec
+    %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec
+    %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec
+    %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec
+    %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec
+    %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec
+    %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec
+    %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec
+    %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec
+    %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, implicit $exec
+    %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec
+    %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec
+    %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec
+    %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, %1356:vgpr_32, implicit $mode, implicit $exec
+    %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec
+    %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec
+    %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec
+    %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec
+    %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec
+    %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec
+    %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec
+    %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec
+    %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec
+    %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec
+    %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec
+    %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec
+    %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec
+    %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec
+    %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec
+    %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec
+    %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec
+    %1375:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec
+    %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec
+    %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec
+    %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec
+    %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec
+    %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec
+    %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec
+    %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec
+    %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec
+    %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec
+    %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec
+    %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec
+    %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec
+    %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec
+    %1389:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec
+    %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec
+    %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec
+    %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, implicit $exec
+    %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec
+    %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec
+    %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec
+    %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec
+    %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec
+    %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec
+    %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec
+    %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec
+    %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec
+    %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec
+    %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec
+    %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec
+    %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec
+    %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit $mode, implicit $exec
+    %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec
+    %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec
+    %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec
+    %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec
+    %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec
+    %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec
+    %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec
+    %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec
+    %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec
+    %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec
+    %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec
+    undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec
+    %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec
+    %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec
+    %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec
+    %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec
+    %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec
+    %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec
+    %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec
+    %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec
+    %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec
+    %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec
+    %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec
+    %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec
+    %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec
+    %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec
+    %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec
+    %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1188:vgpr_32, implicit $mode, implicit $exec
+    %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec
+    %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec
+    %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec
+    %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec
+    %1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec
+    %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec
+    %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec
+    %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec
+    %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec
+    %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec
+    %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec
+    %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec
+    %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec
+    %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec
+    %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec
+    %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec
+    %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec
+    %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec
+    %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec
+    %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec
+    %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec
+    %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec
+    %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec
+    %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec
+    %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec
+    %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1293:vgpr_32, implicit $mode, implicit $exec
+    %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec
+    %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec
+    %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec
+    %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec
+    %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec
+    %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1647:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec
+    %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec
+    %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec
+    %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec
+    %1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec
+    %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3029.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec
+    %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec
+    %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec
+    %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec
+    %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec
+    %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1823:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec
+    %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec
+    %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec
+    %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec
+    %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec
+    %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec
+    %1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec
+    %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec
+    %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec
+    %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec
+    %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1275:vgpr_32, implicit $mode, implicit $exec
+    %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec
+    %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec
+    undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec
+    %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec
+    %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec
+    %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec
+    %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec
+    %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec
+    %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec
+    %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec
+    %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec
+    %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec
+    %2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec
+    %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec
+    %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec
+    %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec
+    %2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec
+    %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec
+    %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec
+    %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec
+    %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec
+    %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec
+    %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, implicit $exec
+    %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec
+    %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec
+    %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec
+    %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec
+    %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3335:vgpr_32, implicit $exec
+    %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec
+    %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec
+    %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec
+    %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec
+    %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec
+    %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
new file mode 100644
index 00000000000000..3e467af66590a4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @smallInterleave() #0 { ret void }
+  ; GCN-LABEL: smallInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
+  ; GCN-NEXT:    ; implicit-def: $sgpr4
+  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr54
+  ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN-NEXT:    ; implicit-def: $vgpr37
+  ; GCN-NEXT:    ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    ; implicit-def: $vgpr55
+  ; GCN-NEXT:    ; implicit-def: $vgpr88
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
+  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
+  ; GCN-NEXT:    ; implicit-def: $vgpr5
+  ; GCN-NEXT:    v_add_lshl_u32 v36, v5, s4, 1
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v36, v[0:3]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v4, s[0:3], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v54
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v54
+  ; GCN-NEXT:    buffer_load_dwordx2 v[24:25], v0, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[26:27], v1, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v37
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v37 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr1
+  ; GCN-NEXT:    ; kill: killed $vgpr0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[20:21], v[28:29], 0
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    ; implicit-def: $sgpr3
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[28:29], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[22:23], v[30:31], v[38:53]
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v55
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[30:31], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v55 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v36, v[16:19]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[20:21], v[28:29], v[38:53]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v37
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[28:29], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr32
+  ; GCN-NEXT:    ; implicit-def: $vgpr33
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[22:23], v[30:31], v[38:53]
+  ; GCN-NEXT:    ; implicit-def: $vgpr20_vgpr21_vgpr22_vgpr23
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[30:31], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v37 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr34
+  ; GCN-NEXT:    ; implicit-def: $vgpr35
+  ; GCN-NEXT:    ; implicit-def: $vgpr37
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[28:29], v[20:21], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[16:17], v[20:21], v[38:53]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[30:31], v[22:23], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v55
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[18:19], v[22:23], v[38:53]
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v55 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr55
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[28:29], v[16:17], v[38:53]
+  ; GCN-NEXT:    ; implicit-def: $vgpr29
+  ; GCN-NEXT:    ; implicit-def: $vgpr28
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[20:21], v[16:17], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v21, s20, v29
+  ; GCN-NEXT:    v_and_b32_e32 v21, 0x1fffffff, v21
+  ; GCN-NEXT:    v_mul_lo_u32 v21, v21, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v68, v32, v21, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v73, v33, 1, v68
+  ; GCN-NEXT:    v_lshl_add_u32 v74, v34, 1, v73
+  ; GCN-NEXT:    v_lshl_add_u32 v75, v35, 1, v74
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[38:53], v[30:31], v[18:19], v[38:53]
+  ; GCN-NEXT:    v_perm_b32 v21, v26, v24, s2
+  ; GCN-NEXT:    v_perm_b32 v24, v26, v24, s3
+  ; GCN-NEXT:    v_perm_b32 v26, v27, v25, s2
+  ; GCN-NEXT:    v_perm_b32 v25, v27, v25, s3
+  ; GCN-NEXT:    v_add_u32_e32 v17, v36, v54
+  ; GCN-NEXT:    v_add_u32_e32 v20, v37, v54
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v68, v21
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v73, v24
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v74, v26
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v75, v25
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[56:57], v17, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[58:59], v20, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[22:23], v[18:19], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v18, s4, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v19, s4, v39
+  ; GCN-NEXT:    v_mul_f32_e32 v22, s4, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v23, s4, v41
+  ; GCN-NEXT:    v_max3_f32 v18, v18, s5, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v27, s4, v42
+  ; GCN-NEXT:    v_mul_f32_e32 v29, s4, v43
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v22, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v30, s4, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v31, s4, v45
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v27, v29
+  ; GCN-NEXT:    v_mul_f32_e32 v32, s4, v46
+  ; GCN-NEXT:    v_mul_f32_e32 v33, s4, v47
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v30, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v34, s4, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v35, s4, v49
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v32, v33
+  ; GCN-NEXT:    v_mul_f32_e32 v36, s4, v50
+  ; GCN-NEXT:    v_mul_f32_e32 v19, s4, v51
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v34, v35
+  ; GCN-NEXT:    v_mul_f32_e32 v22, s4, v52
+  ; GCN-NEXT:    v_mul_f32_e32 v23, s4, v53
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v36, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v27, s4, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v29, s4, v1
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v22, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v30, s4, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v31, s4, v3
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v27, v29
+  ; GCN-NEXT:    v_mul_f32_e32 v32, s4, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v33, s4, v5
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v30, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v34, s4, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v35, s4, v7
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v32, v33
+  ; GCN-NEXT:    v_mul_f32_e32 v19, s4, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v36, s4, v9
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v34, v35
+  ; GCN-NEXT:    v_mul_f32_e32 v22, s4, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v23, s4, v11
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v19, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v27, s4, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v29, s4, v13
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v22, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v30, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v31, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v27, v29
+  ; GCN-NEXT:    v_max3_f32 v18, v18, v30, v31
+  ; GCN-NEXT:    ds_bpermute_b32 v19, v55, v18
+  ; GCN-NEXT:    ; kill: killed $vgpr17
+  ; GCN-NEXT:    v_max_f32_e32 v16, v28, v28
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[78:81], v88
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr20
+  ; GCN-NEXT:    ds_read_b128 v[82:85], v88 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_max_f32_e32 v19, v19, v19
+  ; GCN-NEXT:    v_max_f32_e32 v18, v18, v19
+  ; GCN-NEXT:    ds_bpermute_b32 v19, v55, v18
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v17, v19, v18, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v17, v17, v17
+  ; GCN-NEXT:    v_max_f32_e32 v76, v16, v17
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v38, -v76
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v39, -v76
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v40, -v76
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v41, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v62, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v42, -v76
+  ; GCN-NEXT:    v_sub_f32_e32 v26, v28, v76
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v44, -v76
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v45, -v76
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v60
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v61
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v62
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v63
+  ; GCN-NEXT:    v_exp_f32_e32 v54, v20
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v43, -v76
+  ; GCN-NEXT:    v_exp_f32_e32 v66, v22
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v46, -v76
+  ; GCN-NEXT:    v_fma_f32 v25, s4, v47, -v76
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v21
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v76
+  ; GCN-NEXT:    v_pack_b32_f16 v87, v18, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v86, v16, v17
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v48
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[86:87], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v66
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v48, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v25
+  ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    v_pack_b32_f16 v79, v78, v48
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v49, -v76
+  ; GCN-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[54:55] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_exp_f32_e32 v69, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v70, v70
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+  ; GCN-NEXT:    v_pack_b32_f16 v78, v72, v77
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v48
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v50, -v76
+  ; GCN-NEXT:    v_fma_f32 v49, s4, v51, -v76
+  ; GCN-NEXT:    v_fma_f32 v52, s4, v52, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v49, 0x3fb8aa3b, v49
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[78:79], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v51, v48
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v80, v70
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v49
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v52
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v49, v71
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v52, v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[78:79], v[16:31]
+  ; GCN-NEXT:    v_pack_b32_f16 v86, v77, v80
+  ; GCN-NEXT:    v_fma_f32 v53, s4, v53, -v76
+  ; GCN-NEXT:    v_pack_b32_f16 v87, v49, v52
+  ; GCN-NEXT:    ; implicit-def: $vgpr52
+  ; GCN-NEXT:    ds_read_b128 v[78:81], v52
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[82:85], v52 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v49, 0x3fb8aa3b, v53
+  ; GCN-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v76
+  ; GCN-NEXT:    v_exp_f32_e32 v49, v49
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_exp_f32_e32 v77, v1
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v2, -v76
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[86:87], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v48
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v49
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v51
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v53, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v50
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v2, v1
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v3, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v79, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v79, v2
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v4, -v76
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[0:1], v[32:47]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_fma_f32 v3, s4, v5, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v80, v53
+  ; GCN-NEXT:    v_exp_f32_e32 v4, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v77
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[0:1], v[16:31]
+  ; GCN-NEXT:    v_perm_b32 v0, v58, v56, s2
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v68, v0
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v6, -v76
+  ; GCN-NEXT:    v_perm_b32 v1, v58, v56, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v73, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v78
+  ; GCN-NEXT:    v_exp_f32_e32 v58, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v79
+  ; GCN-NEXT:    v_exp_f32_e32 v5, v3
+  ; GCN-NEXT:    v_perm_b32 v3, v59, v57, s2
+  ; GCN-NEXT:    v_perm_b32 v56, v59, v57, s3
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v74, v3
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v75, v56
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_pack_b32_f16 v57, v1, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v56, v80, v2
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v7, -v76
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v8, -v76
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v9, -v76
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[6:9], v88
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_exp_f32_e32 v59, v0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[56:57], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v7, v1
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v2
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v88 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v5
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[56:57], v[16:31]
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v10, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v58
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v59
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v6, v68
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v11, -v76
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v14, -v76
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[8:9], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v12, -v76
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v15, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
+  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v13, -v76
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v14, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[0:1], v[16:31]
+  ; GCN-NEXT:    v_add_f32_e32 v0, 0, v60
+  ; GCN-NEXT:    v_add_f32_e32 v0, v61, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v62, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v63, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v66, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v69, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v70, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v71, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v72, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v51, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v50, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v48, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v49, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v53, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v77, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v78, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v79, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_add_f32_e32 v0, v4, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_add_f32_e32 v0, v5, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v1
+  ; GCN-NEXT:    v_add_f32_e32 v4, v58, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v52
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v73
+  ; GCN-NEXT:    v_exp_f32_e32 v56, v8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v6
+  ; GCN-NEXT:    v_add_f32_e32 v4, v59, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v7, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v73, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v10, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v8, v9
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v13, v15
+  ; GCN-NEXT:    v_add_f32_e32 v4, v6, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v56
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v12
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v11
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v14
+  ; GCN-NEXT:    v_add_f32_e32 v4, v14, v4
+  ; GCN-NEXT:    v_add_f32_e32 v4, v56, v4
+  ; GCN-NEXT:    v_add_f32_e32 v10, v11, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v6, v5
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v52 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[8:9], v[16:31]
+  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v12, v10
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v55, v2
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v55, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[0:1], v[16:31]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v54
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+
+  !0 = !{i64 2862105}
+
+...
+
+---
+name:            smallInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+  %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  %1:vgpr_32 = COPY %0:vgpr_32
+  %2:vgpr_32 = IMPLICIT_DEF
+  %3:sreg_32 = IMPLICIT_DEF
+  %4:vreg_64_align2 = IMPLICIT_DEF
+  %5:sgpr_128 = IMPLICIT_DEF
+  %6:vgpr_32 = IMPLICIT_DEF
+  %7:vgpr_32 = IMPLICIT_DEF
+  %8:sgpr_128 = IMPLICIT_DEF
+  %9:vgpr_32 = IMPLICIT_DEF
+  %10:sgpr_512 = IMPLICIT_DEF
+  %11:sgpr_32 = IMPLICIT_DEF
+  %12:sreg_64_xexec = IMPLICIT_DEF
+  %13:vgpr_32 = IMPLICIT_DEF
+  %14:sreg_32 = IMPLICIT_DEF
+  %15:sreg_32 = IMPLICIT_DEF
+  %16:vgpr_32 = IMPLICIT_DEF
+  %17:sreg_32 = IMPLICIT_DEF
+  %18:vgpr_32 = IMPLICIT_DEF
+  %19:vgpr_32 = IMPLICIT_DEF
+  %20:vgpr_32 = IMPLICIT_DEF
+  %21:vgpr_32 = IMPLICIT_DEF
+  %22:vgpr_32 = IMPLICIT_DEF
+  %23:vgpr_32 = IMPLICIT_DEF
+  %24:vgpr_32 = IMPLICIT_DEF
+  %25:vgpr_32 = IMPLICIT_DEF
+  %26:sreg_32 = IMPLICIT_DEF
+  %42:vgpr_32 = IMPLICIT_DEF
+  %44:vreg_128_align2 = IMPLICIT_DEF
+  %48:vgpr_32 = IMPLICIT_DEF
+  %49:vreg_128_align2 = IMPLICIT_DEF
+  %52:vreg_128_align2 = IMPLICIT_DEF
+  %55:vreg_128_align2 = IMPLICIT_DEF
+  %106:vgpr_32 = IMPLICIT_DEF
+  %29:vgpr_32 = IMPLICIT_DEF
+  %37:vgpr_32 = IMPLICIT_DEF
+  %259:vreg_512_align2 = IMPLICIT_DEF
+  %260:vreg_512_align2 = IMPLICIT_DEF
+  IGLP_OPT 2
+  %27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec
+  %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec
+  %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, %4:vreg_64_align2, 0, implicit $exec
+  %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc
+  %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec
+  DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec
+  %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec
+  %36:vgpr_32 = V_ADD_U32_e32 %7:vgpr_32, %37:vgpr_32, implicit $exec
+  %38:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %36:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %39:vgpr_32 = V_ADD_U32_e32 %9:vgpr_32, %37:vgpr_32, implicit $exec
+  %40:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %39:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %41:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+  early-clobber %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %41.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %41.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %45:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+  early-clobber %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %45.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %45.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %47:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %50:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  DS_WRITE_B128_gfx9 %34:vgpr_32, %35:vreg_128_align2, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %51:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %53:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %54:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %56:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %57:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub0:vreg_512_align2, implicit $mode, implicit $exec
+  %58:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub1:vreg_512_align2, implicit $mode, implicit $exec
+  %59:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub2:vreg_512_align2, implicit $mode, implicit $exec
+  %60:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub3:vreg_512_align2, implicit $mode, implicit $exec
+  %61:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub4:vreg_512_align2, implicit $mode, implicit $exec
+  %62:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub5:vreg_512_align2, implicit $mode, implicit $exec
+  %63:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub6:vreg_512_align2, implicit $mode, implicit $exec
+  %64:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub7:vreg_512_align2, implicit $mode, implicit $exec
+  %65:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub8:vreg_512_align2, implicit $mode, implicit $exec
+  %66:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub9:vreg_512_align2, implicit $mode, implicit $exec
+  %67:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub10:vreg_512_align2, implicit $mode, implicit $exec
+  %68:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub11:vreg_512_align2, implicit $mode, implicit $exec
+  %69:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub12:vreg_512_align2, implicit $mode, implicit $exec
+  %70:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub13:vreg_512_align2, implicit $mode, implicit $exec
+  %71:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub14:vreg_512_align2, implicit $mode, implicit $exec
+  %72:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub15:vreg_512_align2, implicit $mode, implicit $exec
+  %73:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub0:vreg_512_align2, implicit $mode, implicit $exec
+  %74:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub1:vreg_512_align2, implicit $mode, implicit $exec
+  %75:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub2:vreg_512_align2, implicit $mode, implicit $exec
+  %76:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub3:vreg_512_align2, implicit $mode, implicit $exec
+  %77:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub4:vreg_512_align2, implicit $mode, implicit $exec
+  %78:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub5:vreg_512_align2, implicit $mode, implicit $exec
+  %79:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub6:vreg_512_align2, implicit $mode, implicit $exec
+  %80:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub7:vreg_512_align2, implicit $mode, implicit $exec
+  %81:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub8:vreg_512_align2, implicit $mode, implicit $exec
+  %82:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub9:vreg_512_align2, implicit $mode, implicit $exec
+  %83:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub10:vreg_512_align2, implicit $mode, implicit $exec
+  %84:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub11:vreg_512_align2, implicit $mode, implicit $exec
+  %85:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub12:vreg_512_align2, implicit $mode, implicit $exec
+  %86:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub13:vreg_512_align2, implicit $mode, implicit $exec
+  %87:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub14:vreg_512_align2, implicit $mode, implicit $exec
+  %88:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub15:vreg_512_align2, implicit $mode, implicit $exec
+  %89:vgpr_32 = V_MAX3_F32_e64 0, %57:vgpr_32, 0, %11:sgpr_32, 0, %58:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %90:vgpr_32 = V_MAX3_F32_e64 0, %89:vgpr_32, 0, %59:vgpr_32, 0, %60:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %91:vgpr_32 = V_MAX3_F32_e64 0, %90:vgpr_32, 0, %61:vgpr_32, 0, %62:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %92:vgpr_32 = V_MAX3_F32_e64 0, %91:vgpr_32, 0, %63:vgpr_32, 0, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %93:vgpr_32 = V_MAX3_F32_e64 0, %92:vgpr_32, 0, %65:vgpr_32, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %94:vgpr_32 = V_MAX3_F32_e64 0, %93:vgpr_32, 0, %67:vgpr_32, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %95:vgpr_32 = V_MAX3_F32_e64 0, %94:vgpr_32, 0, %69:vgpr_32, 0, %70:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %96:vgpr_32 = V_MAX3_F32_e64 0, %95:vgpr_32, 0, %71:vgpr_32, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %97:vgpr_32 = V_MAX3_F32_e64 0, %96:vgpr_32, 0, %73:vgpr_32, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %98:vgpr_32 = V_MAX3_F32_e64 0, %97:vgpr_32, 0, %75:vgpr_32, 0, %76:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %99:vgpr_32 = V_MAX3_F32_e64 0, %98:vgpr_32, 0, %77:vgpr_32, 0, %78:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %100:vgpr_32 = V_MAX3_F32_e64 0, %99:vgpr_32, 0, %79:vgpr_32, 0, %80:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %101:vgpr_32 = V_MAX3_F32_e64 0, %100:vgpr_32, 0, %81:vgpr_32, 0, %82:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %102:vgpr_32 = V_MAX3_F32_e64 0, %101:vgpr_32, 0, %83:vgpr_32, 0, %84:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %103:vgpr_32 = V_MAX3_F32_e64 0, %102:vgpr_32, 0, %85:vgpr_32, 0, %86:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %104:vgpr_32 = V_MAX3_F32_e64 0, %103:vgpr_32, 0, %87:vgpr_32, 0, %88:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %105:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %104:vgpr_32, 0, implicit $exec
+  %107:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %105:vgpr_32, %105:vgpr_32, implicit $mode, implicit $exec
+  %108:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %104:vgpr_32, %107:vgpr_32, implicit $mode, implicit $exec
+  %109:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %108:vgpr_32, 0, implicit $exec
+  %110:vgpr_32 = V_CNDMASK_B32_e64 0, %109:vgpr_32, 0, %108:vgpr_32, %12:sreg_64_xexec, implicit $exec
+  %111:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %110:vgpr_32, %110:vgpr_32, implicit $mode, implicit $exec
+  %112:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %13:vgpr_32, %13:vgpr_32, implicit $mode, implicit $exec
+  %113:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %112:vgpr_32, %111:vgpr_32, implicit $mode, implicit $exec
+  %114:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %115:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %114:vgpr_32, implicit $mode, implicit $exec
+  %116:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %115:vgpr_32, implicit $mode, implicit $exec
+  %117:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %118:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %117:vgpr_32, implicit $mode, implicit $exec
+  %119:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %118:vgpr_32, implicit $mode, implicit $exec
+  %120:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %121:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %120:vgpr_32, implicit $mode, implicit $exec
+  %122:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %121:vgpr_32, implicit $mode, implicit $exec
+  %123:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %124:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %123:vgpr_32, implicit $mode, implicit $exec
+  %125:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %124:vgpr_32, implicit $mode, implicit $exec
+  %126:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %127:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %126:vgpr_32, implicit $mode, implicit $exec
+  %128:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %127:vgpr_32, implicit $mode, implicit $exec
+  %129:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %130:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %129:vgpr_32, implicit $mode, implicit $exec
+  %131:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %130:vgpr_32, implicit $mode, implicit $exec
+  %132:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %133:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %132:vgpr_32, implicit $mode, implicit $exec
+  %134:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %133:vgpr_32, implicit $mode, implicit $exec
+  %135:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %136:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %135:vgpr_32, implicit $mode, implicit $exec
+  %137:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %136:vgpr_32, implicit $mode, implicit $exec
+  %138:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %139:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %138:vgpr_32, implicit $mode, implicit $exec
+  %140:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %139:vgpr_32, implicit $mode, implicit $exec
+  %141:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %142:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %141:vgpr_32, implicit $mode, implicit $exec
+  %143:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %142:vgpr_32, implicit $mode, implicit $exec
+  %144:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %145:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %144:vgpr_32, implicit $mode, implicit $exec
+  %146:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %145:vgpr_32, implicit $mode, implicit $exec
+  %147:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %148:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %147:vgpr_32, implicit $mode, implicit $exec
+  %149:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %148:vgpr_32, implicit $mode, implicit $exec
+  %150:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %151:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %150:vgpr_32, implicit $mode, implicit $exec
+  %152:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %151:vgpr_32, implicit $mode, implicit $exec
+  %153:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %154:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %153:vgpr_32, implicit $mode, implicit $exec
+  %155:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %154:vgpr_32, implicit $mode, implicit $exec
+  %156:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %156:vgpr_32, implicit $mode, implicit $exec
+  %158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %157:vgpr_32, implicit $mode, implicit $exec
+  %159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %159:vgpr_32, implicit $mode, implicit $exec
+  %161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %160:vgpr_32, implicit $mode, implicit $exec
+  %162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %162:vgpr_32, implicit $mode, implicit $exec
+  %164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %163:vgpr_32, implicit $mode, implicit $exec
+  %165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %165:vgpr_32, implicit $mode, implicit $exec
+  %167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %166:vgpr_32, implicit $mode, implicit $exec
+  %168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %168:vgpr_32, implicit $mode, implicit $exec
+  %170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %169:vgpr_32, implicit $mode, implicit $exec
+  %171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %171:vgpr_32, implicit $mode, implicit $exec
+  %173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %172:vgpr_32, implicit $mode, implicit $exec
+  %174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %174:vgpr_32, implicit $mode, implicit $exec
+  %176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %175:vgpr_32, implicit $mode, implicit $exec
+  %177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %177:vgpr_32, implicit $mode, implicit $exec
+  %179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %178:vgpr_32, implicit $mode, implicit $exec
+  %180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %180:vgpr_32, implicit $mode, implicit $exec
+  %182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %181:vgpr_32, implicit $mode, implicit $exec
+  %183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %183:vgpr_32, implicit $mode, implicit $exec
+  %185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %184:vgpr_32, implicit $mode, implicit $exec
+  %186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %186:vgpr_32, implicit $mode, implicit $exec
+  %188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %187:vgpr_32, implicit $mode, implicit $exec
+  %189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %189:vgpr_32, implicit $mode, implicit $exec
+  %191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %190:vgpr_32, implicit $mode, implicit $exec
+  %192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %192:vgpr_32, implicit $mode, implicit $exec
+  %194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %193:vgpr_32, implicit $mode, implicit $exec
+  %195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %195:vgpr_32, implicit $mode, implicit $exec
+  %197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %196:vgpr_32, implicit $mode, implicit $exec
+  %198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %198:vgpr_32, implicit $mode, implicit $exec
+  %200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %199:vgpr_32, implicit $mode, implicit $exec
+  %201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %201:vgpr_32, implicit $mode, implicit $exec
+  %203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %202:vgpr_32, implicit $mode, implicit $exec
+  %204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %204:vgpr_32, implicit $mode, implicit $exec
+  %206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %205:vgpr_32, implicit $mode, implicit $exec
+  %207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %207:vgpr_32, implicit $mode, implicit $exec
+  %209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %208:vgpr_32, implicit $mode, implicit $exec
+  %210:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %116:vgpr_32, implicit $mode, implicit $exec
+  %211:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %119:vgpr_32, %210:vgpr_32, implicit $mode, implicit $exec
+  %212:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %122:vgpr_32, %211:vgpr_32, implicit $mode, implicit $exec
+  %213:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %125:vgpr_32, %212:vgpr_32, implicit $mode, implicit $exec
+  %214:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %128:vgpr_32, %213:vgpr_32, implicit $mode, implicit $exec
+  %215:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %131:vgpr_32, %214:vgpr_32, implicit $mode, implicit $exec
+  %216:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %134:vgpr_32, %215:vgpr_32, implicit $mode, implicit $exec
+  %217:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %137:vgpr_32, %216:vgpr_32, implicit $mode, implicit $exec
+  %218:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %140:vgpr_32, %217:vgpr_32, implicit $mode, implicit $exec
+  %219:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %143:vgpr_32, %218:vgpr_32, implicit $mode, implicit $exec
+  %220:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %146:vgpr_32, %219:vgpr_32, implicit $mode, implicit $exec
+  %221:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %149:vgpr_32, %220:vgpr_32, implicit $mode, implicit $exec
+  %222:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %152:vgpr_32, %221:vgpr_32, implicit $mode, implicit $exec
+  %223:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %155:vgpr_32, %222:vgpr_32, implicit $mode, implicit $exec
+  %224:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %158:vgpr_32, %223:vgpr_32, implicit $mode, implicit $exec
+  %225:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %161:vgpr_32, %224:vgpr_32, implicit $mode, implicit $exec
+  %226:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %164:vgpr_32, %225:vgpr_32, implicit $mode, implicit $exec
+  %227:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %167:vgpr_32, %226:vgpr_32, implicit $mode, implicit $exec
+  %228:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %170:vgpr_32, %227:vgpr_32, implicit $mode, implicit $exec
+  %229:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %173:vgpr_32, %228:vgpr_32, implicit $mode, implicit $exec
+  %230:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %176:vgpr_32, %229:vgpr_32, implicit $mode, implicit $exec
+  %231:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %179:vgpr_32, %230:vgpr_32, implicit $mode, implicit $exec
+  %232:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %182:vgpr_32, %231:vgpr_32, implicit $mode, implicit $exec
+  %233:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %185:vgpr_32, %232:vgpr_32, implicit $mode, implicit $exec
+  %234:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %188:vgpr_32, %233:vgpr_32, implicit $mode, implicit $exec
+  %235:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %191:vgpr_32, %234:vgpr_32, implicit $mode, implicit $exec
+  %236:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %194:vgpr_32, %235:vgpr_32, implicit $mode, implicit $exec
+  %237:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %197:vgpr_32, %236:vgpr_32, implicit $mode, implicit $exec
+  %238:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %200:vgpr_32, %237:vgpr_32, implicit $mode, implicit $exec
+  %239:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %203:vgpr_32, %238:vgpr_32, implicit $mode, implicit $exec
+  %240:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %206:vgpr_32, %239:vgpr_32, implicit $mode, implicit $exec
+  %241:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %209:vgpr_32, %240:vgpr_32, implicit $mode, implicit $exec
+  %242:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %241:vgpr_32, 0, implicit $exec
+  %243:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %241:vgpr_32, %242:vgpr_32, implicit $mode, implicit $exec
+  %244:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %243:vgpr_32, 0, implicit $exec
+  %0:vgpr_32 = V_CNDMASK_B32_e64 0, %244:vgpr_32, 0, %243:vgpr_32, %12:sreg_64_xexec, implicit $exec
+  %245:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %13:vgpr_32, %113:vgpr_32, implicit $mode, implicit $exec
+  %246:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %245:vgpr_32, implicit $mode, implicit $exec
+  undef %247.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %246:vgpr_32, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %248:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+  %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+  %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+  %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+  %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, %16:vgpr_32, implicit $exec
+  %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec
+  %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec
+  %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec
+  DS_WRITE_B32_gfx9 %255:vgpr_32, %248:vgpr_32, 0, 0, implicit $exec
+  %256:vgpr_32 = V_LSHL_ADD_U32_e64 %19:vgpr_32, 1, %255:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %256:vgpr_32, %249:vgpr_32, 0, 0, implicit $exec
+  %257:vgpr_32 = V_LSHL_ADD_U32_e64 %20:vgpr_32, 1, %256:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %257:vgpr_32, %250:vgpr_32, 0, 0, implicit $exec
+  %258:vgpr_32 = V_LSHL_ADD_U32_e64 %21:vgpr_32, 1, %257:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %258:vgpr_32, %251:vgpr_32, 0, 0, implicit $exec
+  %0:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %1:vgpr_32, %247.sub0:vreg_64_align2, %0:vgpr_32, implicit $mode, implicit $exec
+  %259.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %261:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %116:vgpr_32, implicit $mode, implicit $exec
+  %262:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %119:vgpr_32, implicit $mode, implicit $exec
+  %263:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %122:vgpr_32, implicit $mode, implicit $exec
+  %264:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %128:vgpr_32, implicit $mode, implicit $exec
+  %265:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %131:vgpr_32, implicit $mode, implicit $exec
+  %266:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %134:vgpr_32, implicit $mode, implicit $exec
+  %267:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %140:vgpr_32, implicit $mode, implicit $exec
+  %268:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %143:vgpr_32, implicit $mode, implicit $exec
+  %269:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %146:vgpr_32, implicit $mode, implicit $exec
+  %270:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %152:vgpr_32, implicit $mode, implicit $exec
+  %271:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %155:vgpr_32, implicit $mode, implicit $exec
+  %272:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %158:vgpr_32, implicit $mode, implicit $exec
+  %273:vgpr_32 = V_ADD_U32_e32 %22:vgpr_32, %37:vgpr_32, implicit $exec
+  %274:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %273:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %275:vgpr_32 = V_ADD_U32_e32 %23:vgpr_32, %37:vgpr_32, implicit $exec
+  %276:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %275:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %277:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+  %278:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+  %279:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+  %280:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %281:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+  %282:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+  %283:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+  %284:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+  DS_WRITE_B32_gfx9 %255:vgpr_32, %281:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %256:vgpr_32, %282:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %257:vgpr_32, %283:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %258:vgpr_32, %284:vgpr_32, 0, 0, implicit $exec
+  %285:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %125:vgpr_32, implicit $mode, implicit $exec
+  %286:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %137:vgpr_32, implicit $mode, implicit $exec
+  %287:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %149:vgpr_32, implicit $mode, implicit $exec
+  %288:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %161:vgpr_32, implicit $mode, implicit $exec
+  undef %289.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %263:vgpr_32, 0, %285:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %289.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %261:vgpr_32, 0, %262:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %290.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %266:vgpr_32, 0, %286:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %290.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %264:vgpr_32, 0, %265:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %291.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %269:vgpr_32, 0, %287:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %291.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %267:vgpr_32, 0, %268:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %292.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %272:vgpr_32, 0, %288:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %292.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %270:vgpr_32, 0, %271:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %293:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %209:vgpr_32, implicit $mode, implicit $exec
+  %294:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %203:vgpr_32, implicit $mode, implicit $exec
+  %295:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %197:vgpr_32, implicit $mode, implicit $exec
+  %296:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %191:vgpr_32, implicit $mode, implicit $exec
+  %297:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %185:vgpr_32, implicit $mode, implicit $exec
+  %298:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %179:vgpr_32, implicit $mode, implicit $exec
+  %299:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %173:vgpr_32, implicit $mode, implicit $exec
+  %300:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %167:vgpr_32, implicit $mode, implicit $exec
+  %301:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %206:vgpr_32, implicit $mode, implicit $exec
+  %302:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %200:vgpr_32, implicit $mode, implicit $exec
+  %303:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %194:vgpr_32, implicit $mode, implicit $exec
+  %304:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %188:vgpr_32, implicit $mode, implicit $exec
+  %305:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %182:vgpr_32, implicit $mode, implicit $exec
+  %306:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %176:vgpr_32, implicit $mode, implicit $exec
+  %307:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %170:vgpr_32, implicit $mode, implicit $exec
+  %308:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %164:vgpr_32, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  undef %309.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %307:vgpr_32, 0, %299:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %309.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %308:vgpr_32, 0, %300:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %310.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %305:vgpr_32, 0, %297:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %310.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %306:vgpr_32, 0, %298:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %311.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %303:vgpr_32, 0, %295:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %311.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %304:vgpr_32, 0, %296:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %312.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %301:vgpr_32, 0, %293:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %312.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %302:vgpr_32, 0, %294:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %313:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %314:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %315:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %316:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %37:vgpr_32 = V_ADD_U32_e32 %26:sreg_32, %37:vgpr_32, implicit $exec
+  %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec
+  S_ENDPGM 0
+...
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir
new file mode 100644
index 00000000000000..71c085968cccb4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir
@@ -0,0 +1,646 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @tinyInterleave() #0 { ret void }
+  ; GCN-LABEL: tinyInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_mov_b32_e32 v96, v9
+  ; GCN-NEXT:    v_mov_b32_e32 v98, v13
+  ; GCN-NEXT:    v_max_f32_e32 v9, v13, v13
+  ; GCN-NEXT:    v_max_f32_e32 v13, v29, v29
+  ; GCN-NEXT:    ; implicit-def: $vgpr84
+  ; GCN-NEXT:    ; implicit-def: $vgpr85
+  ; GCN-NEXT:    v_max_f32_e32 v9, v84, v9
+  ; GCN-NEXT:    v_max_f32_e32 v13, v85, v13
+  ; GCN-NEXT:    v_max3_f32 v13, v13, v30, v31
+  ; GCN-NEXT:    v_max3_f32 v9, v9, v14, v15
+  ; GCN-NEXT:    ; implicit-def: $vgpr116
+  ; GCN-NEXT:    v_mov_b32_e32 v97, v25
+  ; GCN-NEXT:    v_mov_b32_e32 v100, v14
+  ; GCN-NEXT:    ds_bpermute_b32 v14, v116, v13
+  ; GCN-NEXT:    ds_bpermute_b32 v25, v116, v9
+  ; GCN-NEXT:    ; implicit-def: $vgpr92_vgpr93
+  ; GCN-NEXT:    v_mov_b32_e32 v90, v15
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    v_mov_b32_e32 v101, v30
+  ; GCN-NEXT:    v_mov_b32_e32 v91, v31
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+  ; GCN-NEXT:    v_max3_f32 v15, v93, v13, v14
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_max3_f32 v14, v92, v9, v25
+  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], s[4:5], v[14:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN-NEXT:    v_mov_b32_e32 v88, v78
+  ; GCN-NEXT:    v_fma_f32 v9, s5, v64, -v30
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v9
+  ; GCN-NEXT:    v_fma_f32 v9, s5, v65, -v30
+  ; GCN-NEXT:    v_fma_f32 v13, s5, v76, -v30
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v13
+  ; GCN-NEXT:    v_fma_f32 v9, s5, v66, -v30
+  ; GCN-NEXT:    v_fma_f32 v7, s5, v7, -v30
+  ; GCN-NEXT:    v_exp_f32_e32 v14, v7
+  ; GCN-NEXT:    v_fma_f32 v7, s5, v8, -v30
+  ; GCN-NEXT:    v_fma_f32 v13, s5, v48, -v31
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v9
+  ; GCN-NEXT:    v_fma_f32 v9, s5, v49, -v31
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v122, v86
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v13
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v64
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v9
+  ; GCN-NEXT:    v_fma_f32 v13, s5, v60, -v31
+  ; GCN-NEXT:    v_fma_f32 v60, s5, v61, -v31
+  ; GCN-NEXT:    v_mov_b32_e32 v94, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v13
+  ; GCN-NEXT:    v_fma_f32 v13, s5, v23, -v31
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v85
+  ; GCN-NEXT:    v_exp_f32_e32 v15, v13
+  ; GCN-NEXT:    v_fma_f32 v13, s5, v24, -v31
+  ; GCN-NEXT:    v_pk_fma_f32 v[24:25], s[4:5], v[96:97], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_fma_f32 v121, s5, v77, -v30
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v13
+  ; GCN-NEXT:    v_exp_f32_e32 v66, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v77, v25
+  ; GCN-NEXT:    v_exp_f32_e32 v76, v24
+  ; GCN-NEXT:    v_mov_b32_e32 v24, v10
+  ; GCN-NEXT:    v_mov_b32_e32 v25, v26
+  ; GCN-NEXT:    v_mov_b32_e32 v26, v11
+  ; GCN-NEXT:    v_pk_fma_f32 v[24:25], s[4:5], v[24:25], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[10:11], s[4:5], v[26:27], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_mov_b32_e32 v13, v28
+  ; GCN-NEXT:    v_mov_b32_e32 v99, v29
+  ; GCN-NEXT:    v_exp_f32_e32 v97, v25
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v24
+  ; GCN-NEXT:    v_exp_f32_e32 v103, v11
+  ; GCN-NEXT:    v_exp_f32_e32 v102, v10
+  ; GCN-NEXT:    v_pk_fma_f32 v[10:11], s[4:5], v[12:13], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_fma_f32 v9, s5, v50, -v31
+  ; GCN-NEXT:    v_exp_f32_e32 v29, v11
+  ; GCN-NEXT:    v_exp_f32_e32 v28, v10
+  ; GCN-NEXT:    v_pk_fma_f32 v[10:11], s[4:5], v[98:99], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_mov_b32_e32 v24, v72
+  ; GCN-NEXT:    v_exp_f32_e32 v99, v11
+  ; GCN-NEXT:    v_mov_b32_e32 v11, v52
+  ; GCN-NEXT:    v_mov_b32_e32 v52, v69
+  ; GCN-NEXT:    v_mov_b32_e32 v25, v56
+  ; GCN-NEXT:    v_mov_b32_e32 v56, v73
+  ; GCN-NEXT:    v_pk_fma_f32 v[72:73], s[4:5], v[52:53], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v52, v65
+  ; GCN-NEXT:    v_exp_f32_e32 v98, v10
+  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
+  ; GCN-NEXT:    v_mov_b32_e32 v95, v51
+  ; GCN-NEXT:    v_mov_b32_e32 v10, v68
+  ; GCN-NEXT:    v_mov_b32_e32 v12, v70
+  ; GCN-NEXT:    v_mov_b32_e32 v13, v54
+  ; GCN-NEXT:    v_mov_b32_e32 v54, v71
+  ; GCN-NEXT:    v_pk_fma_f32 v[68:69], s[4:5], v[94:95], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[70:71], s[4:5], v[10:11], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v69, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v71
+  ; GCN-NEXT:    v_pk_fma_f32 v[92:93], s[4:5], v[92:93], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[94:95], s[4:5], v[54:55], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v53, v71
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v73
+  ; GCN-NEXT:    v_mov_b32_e32 v26, v74
+  ; GCN-NEXT:    v_mov_b32_e32 v27, v58
+  ; GCN-NEXT:    v_mov_b32_e32 v58, v75
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v73
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v93
+  ; GCN-NEXT:    v_pk_fma_f32 v[74:75], s[4:5], v[12:13], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr104
+  ; GCN-NEXT:    ds_read_b128 v[10:13], v104 offset:4352
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[50:51] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_pk_fma_f32 v[48:49], s[4:5], v[100:101], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[100:101], s[4:5], v[24:25], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pack_b32_f16 v51, v51, v50
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v55, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v95
+  ; GCN-NEXT:    v_pack_b32_f16 v50, v23, v61
+  ; GCN-NEXT:    v_pk_fma_f32 v[104:105], s[4:5], v[56:57], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_mov_b32_e32 v89, v62
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v93
+  ; GCN-NEXT:    v_exp_f32_e32 v95, v101
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[10:11], v[50:51], v[32:47]
+  ; GCN-NEXT:    v_mov_b32_e32 v62, v79
+  ; GCN-NEXT:    v_pk_fma_f32 v[106:107], s[4:5], v[26:27], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v95
+  ; GCN-NEXT:    v_exp_f32_e32 v79, v105
+  ; GCN-NEXT:    v_pk_fma_f32 v[108:109], s[4:5], v[58:59], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v55, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v53, v54
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v56, v79
+  ; GCN-NEXT:    v_exp_f32_e32 v101, v107
+  ; GCN-NEXT:    ; implicit-def: $vgpr110
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[12:13], v[24:25], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[24:27], v110 offset:4352
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pk_fma_f32 v[88:89], s[4:5], v[88:89], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v101
+  ; GCN-NEXT:    v_exp_f32_e32 v105, v109
+  ; GCN-NEXT:    v_pk_fma_f32 v[110:111], s[4:5], v[62:63], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v23, v60
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v89
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v53, v105
+  ; GCN-NEXT:    v_exp_f32_e32 v107, v111
+  ; GCN-NEXT:    v_pack_b32_f16 v50, v50, v56
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v51, v51, v53
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v55, v89
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v107
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[50:51], v[32:47]
+  ; GCN-NEXT:    v_pack_b32_f16 v114, v52, v54
+  ; GCN-NEXT:    v_exp_f32_e32 v68, v68
+  ; GCN-NEXT:    v_pack_b32_f16 v115, v55, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v113, v49
+  ; GCN-NEXT:    v_exp_f32_e32 v112, v48
+  ; GCN-NEXT:    v_pk_fma_f32 v[48:49], s[4:5], v[90:91], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v70, v70
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[26:27], v[114:115], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v49
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v48
+  ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v72
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v74
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    s_nop 5
+  ; GCN-NEXT:    v_exp_f32_e32 v32, v92
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v94
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v100
+  ; GCN-NEXT:    v_exp_f32_e32 v100, v106
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[32:33] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v68
+  ; GCN-NEXT:    v_exp_f32_e32 v106, v110
+  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr82_vgpr83
+  ; GCN-NEXT:    ; implicit-def: $sgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr117
+  ; GCN-NEXT:    ; implicit-def: $vgpr119
+  ; GCN-NEXT:    ; implicit-def: $vgpr118
+  ; GCN-NEXT:    ; implicit-def: $vgpr120
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    v_pack_b32_f16 v33, v7, v32
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v78, v122
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v92
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v36, v74
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[32:33], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v35, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v11, v36, v7
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v104
+  ; GCN-NEXT:    v_exp_f32_e32 v104, v108
+  ; GCN-NEXT:    v_pack_b32_f16 v10, v34, v35
+  ; GCN-NEXT:    v_mov_b32_e32 v7, v22
+  ; GCN-NEXT:    v_exp_f32_e32 v22, v121
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[12:13], v[10:11], v[48:63]
+  ; GCN-NEXT:    v_pk_add_f32 v[10:11], v[84:85], v[86:87]
+  ; GCN-NEXT:    v_pk_fma_f32 v[6:7], s[4:5], v[6:7], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[10:11]
+  ; GCN-NEXT:    v_mov_b32_e32 v10, v0
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[68:69], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v11, v16
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[70:71], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v16, v1
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[72:73], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[74:75], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v1, v18
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[92:93], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v18, v3
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[94:95], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v2, v4
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[78:79], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v3, v20
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[100:101], v[8:9]
+  ; GCN-NEXT:    v_mov_b32_e32 v20, v5
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[104:105], v[8:9]
+  ; GCN-NEXT:    v_pk_fma_f32 v[4:5], s[4:5], v[10:11], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[10:11], s[4:5], v[16:17], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v5, v5
+  ; GCN-NEXT:    v_exp_f32_e32 v4, v4
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[64:65], v[8:9]
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v10
+  ; GCN-NEXT:    v_pk_fma_f32 v[0:1], s[4:5], v[0:1], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[22:23], v[8:9]
+  ; GCN-NEXT:    v_exp_f32_e32 v1, v1
+  ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+  ; GCN-NEXT:    v_pk_fma_f32 v[12:13], s[4:5], v[18:19], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[88:89], v[8:9]
+  ; GCN-NEXT:    v_exp_f32_e32 v13, v13
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v12
+  ; GCN-NEXT:    v_pk_fma_f32 v[2:3], s[4:5], v[2:3], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[8:9], v[106:107], v[8:9]
+  ; GCN-NEXT:    v_exp_f32_e32 v3, v3
+  ; GCN-NEXT:    v_exp_f32_e32 v2, v2
+  ; GCN-NEXT:    v_pk_fma_f32 v[16:17], s[4:5], v[20:21], v[30:31] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[4:5], v[4:5], v[8:9]
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v16
+  ; GCN-NEXT:    v_pk_add_f32 v[4:5], v[10:11], v[4:5]
+  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
+  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[12:13], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[2:3], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[16:17], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[6:7], v[0:1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v106
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v88
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v104
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v100
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v94
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[14:15], v[0:1]
+  ; GCN-NEXT:    v_pack_b32_f16 v3, v37, v2
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[66:67], v[0:1]
+  ; GCN-NEXT:    v_pack_b32_f16 v2, v32, v33
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[76:77], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[96:97], v[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[24:25], v[2:3], v[48:63]
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[102:103], v[0:1]
+  ; GCN-NEXT:    v_perm_b32 v2, v83, v81, s0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[28:29], v[0:1]
+  ; GCN-NEXT:    v_perm_b32 v3, v83, v81, s1
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[98:99], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[112:113], v[0:1]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[0:1], v[90:91], v[0:1]
+  ; GCN-NEXT:    ds_bpermute_b32 v0, v116, v0
+  ; GCN-NEXT:    ds_bpermute_b32 v1, v116, v1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v5, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v123, v4
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_perm_b32 v0, v82, v80, s0
+  ; GCN-NEXT:    v_perm_b32 v1, v82, v80, s1
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v117, v0
+  ; GCN-NEXT:    v_lshl_add_u32 v0, v119, 1, v117
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v118, v1
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v0, v2
+  ; GCN-NEXT:    v_lshl_add_u32 v0, v120, 1, v117
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v0, v3
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+
+  !0 = !{i64 2862105}
+
+...
+
+
+---
+name:            tinyInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       3
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+    %712:vgpr_32 = IMPLICIT_DEF
+    %728:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %1979:vreg_64_align2 = IMPLICIT_DEF
+    %1:sgpr_512 = IMPLICIT_DEF
+    %608:vreg_512_align2 = IMPLICIT_DEF
+    %621:vreg_512_align2 = IMPLICIT_DEF
+    %639:vreg_512_align2 = IMPLICIT_DEF
+    %654:vreg_512_align2 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %753:vreg_64_align2 = IMPLICIT_DEF
+    %751:vreg_64_align2 = IMPLICIT_DEF
+    %731:sreg_32 = IMPLICIT_DEF
+    %733:sreg_32 = IMPLICIT_DEF
+    %753:vreg_64_align2 = IMPLICIT_DEF
+    %746:vgpr_32 = IMPLICIT_DEF
+    %747:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %1864:vreg_512_align2 = IMPLICIT_DEF
+    %1861:vreg_512_align2 = IMPLICIT_DEF
+    IGLP_OPT 2
+    undef %1942.sub0:vreg_64_align2 = COPY %608.sub3:vreg_512_align2
+    %1942.sub1:vreg_64_align2 = COPY %621.sub3:vreg_512_align2
+    undef %1923.sub0:vreg_64_align2 = COPY %608.sub14:vreg_512_align2
+    %1923.sub1:vreg_64_align2 = COPY %621.sub14:vreg_512_align2
+    undef %1904.sub0:vreg_64_align2 = COPY %639.sub9:vreg_512_align2
+    %1904.sub1:vreg_64_align2 = COPY %654.sub9:vreg_512_align2
+    undef %1885.sub0:vreg_64_align2 = COPY %639.sub13:vreg_512_align2
+    %1885.sub1:vreg_64_align2 = COPY %654.sub13:vreg_512_align2
+    %758:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %639.sub13:vreg_512_align2, %639.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %759:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %712:vgpr_32, %758:vgpr_32, implicit $mode, implicit $exec
+    %760:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %654.sub13:vreg_512_align2, %654.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %761:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %728:vgpr_32, %760:vgpr_32, implicit $mode, implicit $exec
+    undef %1856.sub0:vreg_64_align2 = COPY %639.sub14:vreg_512_align2
+    %1856.sub1:vreg_64_align2 = COPY %654.sub14:vreg_512_align2
+    undef %1977.sub0:vreg_64_align2 = COPY %639.sub15:vreg_512_align2
+    %1977.sub1:vreg_64_align2 = COPY %654.sub15:vreg_512_align2
+    %764:vgpr_32 = V_MAX3_F32_e64 0, %761:vgpr_32, 0, %654.sub14:vreg_512_align2, 0, %654.sub15:vreg_512_align2, 0, 0, implicit $mode, implicit $exec
+    %765:vgpr_32 = V_MAX3_F32_e64 0, %759:vgpr_32, 0, %639.sub14:vreg_512_align2, 0, %639.sub15:vreg_512_align2, 0, 0, implicit $mode, implicit $exec
+    %766:vgpr_32 = DS_BPERMUTE_B32 %35:vgpr_32, %765:vgpr_32, 0, implicit $exec
+    %767:vgpr_32 = DS_BPERMUTE_B32 %35:vgpr_32, %764:vgpr_32, 0, implicit $exec
+    undef %1959.sub1:vreg_64_align2 = V_MAX3_F32_e64 0, %1979.sub1:vreg_64_align2, 0, %764:vgpr_32, 0, %767:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1959.sub0:vreg_64_align2 = V_MAX3_F32_e64 0, %1979.sub0:vreg_64_align2, 0, %765:vgpr_32, 0, %766:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %771:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %1.sub4_sub5:sgpr_512, 8, %1959:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %774:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub0:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1918.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %774:vgpr_32, implicit $mode, implicit $exec
+    %776:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub1:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1899.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %776:vgpr_32, implicit $mode, implicit $exec
+    %778:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub2:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %779:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub12:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1930.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %779:vgpr_32, implicit $mode, implicit $exec
+    %781:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub13:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %782:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %639.sub7:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1912.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %782:vgpr_32, implicit $mode, implicit $exec
+    %783:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %639.sub8:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %785:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub0:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1918.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %785:vgpr_32, implicit $mode, implicit $exec
+    %787:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub1:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1899.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %787:vgpr_32, implicit $mode, implicit $exec
+    %789:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub2:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %790:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub12:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1930.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %790:vgpr_32, implicit $mode, implicit $exec
+    %792:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub13:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %793:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %654.sub7:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1912.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %793:vgpr_32, implicit $mode, implicit $exec
+    %794:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %654.sub8:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1941.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %794:vgpr_32, implicit $mode, implicit $exec
+    %1941.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %783:vgpr_32, implicit $mode, implicit $exec
+    %798:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1904:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1922.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %798.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1922.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %798.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1903.sub0:vreg_64_align2 = COPY %639.sub10:vreg_512_align2
+    %1903.sub1:vreg_64_align2 = COPY %654.sub10:vreg_512_align2
+    %806:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1903:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1884.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %806.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1884.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %806.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %654.sub10:vreg_512_align2 = COPY %639.sub11:vreg_512_align2
+    %814:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub10_sub11:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1976.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %814.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1976.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %814.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %639.sub13:vreg_512_align2 = COPY %654.sub12:vreg_512_align2
+    %822:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %639.sub12_sub13:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1940.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %822.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1940.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %822.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %829:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1885:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1921.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %829.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1921.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %829.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %836:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1856:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1902.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %836.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1902.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %836.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %843:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1977:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1883.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %843.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1883.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %843.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1870.sub0:vreg_64_align2 = COPY %608.sub4:vreg_512_align2
+    %1870.sub1:vreg_64_align2 = COPY %621.sub4:vreg_512_align2
+    %621.sub4:vreg_512_align2 = COPY %608.sub5:vreg_512_align2
+    undef %1957.sub0:vreg_64_align2 = COPY %608.sub6:vreg_512_align2
+    %1957.sub1:vreg_64_align2 = COPY %621.sub6:vreg_512_align2
+    %621.sub6:vreg_512_align2 = COPY %608.sub7:vreg_512_align2
+    undef %1920.sub0:vreg_64_align2 = COPY %608.sub8:vreg_512_align2
+    %1920.sub1:vreg_64_align2 = COPY %621.sub8:vreg_512_align2
+    %621.sub8:vreg_512_align2 = COPY %608.sub9:vreg_512_align2
+    undef %1882.sub0:vreg_64_align2 = COPY %608.sub10:vreg_512_align2
+    %1882.sub1:vreg_64_align2 = COPY %621.sub10:vreg_512_align2
+    %621.sub10:vreg_512_align2 = COPY %608.sub11:vreg_512_align2
+    undef %1974.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %789:vgpr_32, implicit $mode, implicit $exec
+    %1974.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %778:vgpr_32, implicit $mode, implicit $exec
+    %861:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1942:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1956.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %861.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1956.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %861.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %868:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1870:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1938.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %868.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1938.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %868.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %875:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub4_sub5:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1919.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %875.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1919.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %875.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %882:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1957:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1900.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %882.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1900.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %882.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %889:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub6_sub7:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1881.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %889.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1881.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %889.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %896:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1920:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1863.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %896.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1863.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %896.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %903:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub8_sub9:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1973.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %903.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1973.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %903.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %910:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1882:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1955.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %910.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1955.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %910.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %917:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub10_sub11:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1937.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %917.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1937.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %917.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %926:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1918:vreg_64_align2, 8, %1899:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %928:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1974:vreg_64_align2, 8, %926:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %929:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1956:vreg_64_align2, 8, %928:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %930:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1938:vreg_64_align2, 8, %929:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1919:vreg_64_align2, 8, %930:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %932:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1900:vreg_64_align2, 8, %931:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %933:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1881:vreg_64_align2, 8, %932:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %934:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1863:vreg_64_align2, 8, %933:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %935:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1973:vreg_64_align2, 8, %934:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %936:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1955:vreg_64_align2, 8, %935:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %937:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1937:vreg_64_align2, 8, %936:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %621.sub14:vreg_512_align2 = COPY %608.sub15:vreg_512_align2
+    undef %1860.sub0:vreg_64_align2 = COPY %639.sub0:vreg_512_align2
+    %1860.sub1:vreg_64_align2 = COPY %654.sub0:vreg_512_align2
+    %654.sub0:vreg_512_align2 = COPY %639.sub1:vreg_512_align2
+    undef %1954.sub0:vreg_64_align2 = COPY %639.sub2:vreg_512_align2
+    %1954.sub1:vreg_64_align2 = COPY %654.sub2:vreg_512_align2
+    %654.sub2:vreg_512_align2 = COPY %639.sub3:vreg_512_align2
+    undef %1917.sub0:vreg_64_align2 = COPY %639.sub4:vreg_512_align2
+    %1917.sub1:vreg_64_align2 = COPY %654.sub4:vreg_512_align2
+    %654.sub4:vreg_512_align2 = COPY %639.sub5:vreg_512_align2
+    %639.sub7:vreg_512_align2 = COPY %654.sub6:vreg_512_align2
+    undef %1857.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %792:vgpr_32, implicit $mode, implicit $exec
+    %1857.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %781:vgpr_32, implicit $mode, implicit $exec
+    %949:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1923:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1970.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %949.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1970.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %949.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %956:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub14_sub15:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1952.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %956.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1952.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %956.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %963:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1860:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1934.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %963.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1934.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %963.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %970:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub0_sub1:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1915.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %970.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1915.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %970.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %977:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1954:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1896.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %977.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1896.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %977.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %984:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub2_sub3:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1875.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %984.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1875.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %984.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %991:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1917:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1986.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %991.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1986.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %991.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %998:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub4_sub5:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1968.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %998.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1968.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %998.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1005:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %639.sub6_sub7:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1950.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1005.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1950.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1005.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1013:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1930:vreg_64_align2, 8, %937:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1014:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1857:vreg_64_align2, 8, %1013:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1015:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1970:vreg_64_align2, 8, %1014:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1016:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1952:vreg_64_align2, 8, %1015:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1017:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1934:vreg_64_align2, 8, %1016:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1018:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1915:vreg_64_align2, 8, %1017:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1019:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1896:vreg_64_align2, 8, %1018:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1020:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1875:vreg_64_align2, 8, %1019:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1021:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1986:vreg_64_align2, 8, %1020:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1022:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1968:vreg_64_align2, 8, %1021:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1023:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1950:vreg_64_align2, 8, %1022:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1025:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1912:vreg_64_align2, 8, %1023:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1026:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1941:vreg_64_align2, 8, %1025:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1027:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1922:vreg_64_align2, 8, %1026:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1028:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1884:vreg_64_align2, 8, %1027:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1029:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1976:vreg_64_align2, 8, %1028:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1030:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1940:vreg_64_align2, 8, %1029:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1031:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1921:vreg_64_align2, 8, %1030:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1032:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1902:vreg_64_align2, 8, %1031:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %114:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1883:vreg_64_align2, 8, %1032:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1971.sub0:vreg_64_align2 = DS_BPERMUTE_B32 %35:vgpr_32, %114.sub0:vreg_64_align2, 0, implicit $exec
+    %1971.sub1:vreg_64_align2 = DS_BPERMUTE_B32 %35:vgpr_32, %114.sub1:vreg_64_align2, 0, implicit $exec
+    %1035:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1979:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1069.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1035.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1893.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1035.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1864.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub0_sub1:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub2_sub3:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub4_sub5:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub6_sub7:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub8_sub9:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub10_sub11:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub12_sub13:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub14_sub15:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub0_sub1:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub2_sub3:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub4_sub5:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub6_sub7:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub8_sub9:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub10_sub11:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub12_sub13:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub14_sub15:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1096:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1918.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1097:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1899.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1098:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1974.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1099:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1956.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1100:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1938.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1101:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1919.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1102:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1900.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1103:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1881.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1104:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1863.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1105:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1973.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1106:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1955.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1107:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1937.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1108:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1930.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1109:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1857.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1110:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1970.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1111:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1952.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1112:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1918.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1113:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1899.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1114:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1974.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1115:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1956.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1116:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1938.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1117:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1919.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1118:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1900.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1119:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1881.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1120:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1863.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1121:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1973.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1122:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1955.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1123:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1937.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1124:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1930.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1125:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1857.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1126:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1970.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1127:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1952.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %1871.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1871.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1983.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1983.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1965.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1965.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1947.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1947.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1927.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1114:vgpr_32, 0, %1115:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1927.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1909.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1118:vgpr_32, 0, %1119:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1909.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1116:vgpr_32, 0, %1117:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1890.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1122:vgpr_32, 0, %1123:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1890.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1120:vgpr_32, 0, %1121:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1867.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1126:vgpr_32, 0, %1127:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1867.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1124:vgpr_32, 0, %1125:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1152:vreg_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 4352, 0, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub0_sub1:vreg_128_align2, %1871:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub2_sub3:vreg_128_align2, %1983:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub0_sub1:vreg_128_align2, %1927:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub2_sub3:vreg_128_align2, %1909:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1175:vreg_128_align2 = DS_READ_B128_gfx9 %39:vgpr_32, 4352, 0, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub0_sub1:vreg_128_align2, %1965:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub2_sub3:vreg_128_align2, %1947:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub0_sub1:vreg_128_align2, %1890:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub2_sub3:vreg_128_align2, %1867:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1194:vgpr_32 = V_PERM_B32_e64 %753.sub0:vreg_64_align2, %751.sub0:vreg_64_align2, %731:sreg_32, implicit $exec
+    %1195:vgpr_32 = V_PERM_B32_e64 %753.sub0:vreg_64_align2, %751.sub0:vreg_64_align2, %733:sreg_32, implicit $exec
+    %1198:vgpr_32 = V_PERM_B32_e64 %753.sub1:vreg_64_align2, %751.sub1:vreg_64_align2, %731:sreg_32, implicit $exec
+    %1199:vgpr_32 = V_PERM_B32_e64 %753.sub1:vreg_64_align2, %751.sub1:vreg_64_align2, %733:sreg_32, implicit $exec
+    DS_WRITE_B32_gfx9 %746:vgpr_32, %1194:vgpr_32, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %747:vgpr_32, %1195:vgpr_32, 0, 0, implicit $exec
+    %1200:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %746:vgpr_32, implicit $exec
+    DS_WRITE_B32_gfx9 %1200:vgpr_32, %1198:vgpr_32, 0, 0, implicit $exec
+    %1201:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %746:vgpr_32, implicit $exec
+    DS_WRITE_B32_gfx9 %1201:vgpr_32, %1199:vgpr_32, 0, 0, implicit $exec
+    S_ENDPGM 0
+...



More information about the llvm-commits mailing list