[llvm] [AMDGPU] Introduce IGLP_OPT(2): Generalized exp/mfma interleaving for select kernels (PR #81342)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 9 16:49:40 PST 2024


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/81342

This is a redesign of https://github.com/llvm/llvm-project/pull/80370 . The implementation is a bit more organized and less repetitive.

This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against).

It is still a WIP while I do some last-minute cleanup and testing.

>From eb4594624e505ae7a0d7bca13c4d54e5d5425a0d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 1 Feb 2024 13:57:13 -0800
Subject: [PATCH 1/2] [AMDGPU] Introduce IGLPPhase

Change-Id: I3690e082b98b57392075cac783b853f3fb48b0e5
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     | 28 +++++++++----------
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h       | 10 ++++++-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  6 ++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  6 ++--
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4462cd8a31f13e..74b62f22aff216 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -852,7 +852,7 @@ class IGLPStrategy {
   virtual void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) = 0;
+      IGLPPhase Phase) = 0;
 
   // Returns true if this strategy should be applied to a ScheduleDAG.
   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -871,7 +871,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) override;
+      IGLPPhase Phase) override;
 
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
@@ -884,7 +884,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
 void MFMASmallGemmOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-    bool IsReentry) {
+    IGLPPhase Phase) {
   // Count the number of MFMA instructions.
   unsigned MFMACount = 0;
   for (const MachineInstr &I : *DAG)
@@ -1101,7 +1101,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) override;
+      IGLPPhase Phase) override;
 
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
@@ -1118,11 +1118,11 @@ static unsigned DSWWithSharedVMEMCount = 0;
 void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-    bool IsReentry) {
+    IGLPPhase Phase) {
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
 
-  assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
+  assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
          "DSWCounters should be zero in pre-RA scheduling!");
   SmallVector<SUnit *, 6> DSWithPerms;
@@ -1133,7 +1133,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     else if (TII->isDS(*I)) {
       if (I->mayLoad())
         ++DSRCount;
-      else if (I->mayStore() && !IsReentry) {
+      else if (I->mayStore() && Phase == IGLPPhase::Initial) {
         ++DSWCount;
         for (auto Pred : SU.Preds) {
           if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1146,7 +1146,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     }
   }
 
-  if (!IsReentry) {
+  if (Phase == IGLPPhase::Initial) {
     DSWWithPermCount = DSWithPerms.size();
     auto I = DSWithPerms.begin();
     auto E = DSWithPerms.end();
@@ -1414,10 +1414,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   bool IsBottomUp = 1;
 
   // Whether or not this is a reentry into the IGroupLPDAGMutation.
-  bool IsReentry = false;
+  IGLPPhase Phase = IGLPPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
+  IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {}
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -1717,7 +1717,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
   auto S = createIGLPStrategy(StrategyID, DAG, TII);
   if (S->shouldApplyStrategy(DAG)) {
     IsBottomUp = S->IsBottomUp;
-    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
+    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
   }
 }
 
@@ -1725,13 +1725,13 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 
 namespace llvm {
 
-/// \p IsReentry specifes whether or not this is a reentry into the
+/// \p Phase specifies whether or not this is a reentry into the
 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
-  return std::make_unique<IGroupLPDAGMutation>(IsReentry);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase) {
+  return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 3ec8be4f889205..0b72c3dbecce1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -14,7 +14,15 @@
 
 namespace llvm {
 
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry);
+// The phase of instruction scheduling during which the IGroupLP DAG mutation
+// is being applied.
+enum class IGLPPhase {
+  Initial = 0u,
+  PreRAReentry = 1u << 0,
+  PostRA = 1u << 1
+};
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase);
 
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b8a7a5e2080213..cfebf72c4f42a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -471,7 +471,7 @@ static ScheduleDAGInstrs *
 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
   return DAG;
 }
 
@@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     if (ST.shouldClusterStores())
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
-    DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
+    DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4081115aa68cad..e67a296bcada17 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
-  DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+  DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry));
 
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
@@ -855,7 +855,7 @@ bool GCNSchedStage::initGCNRegion() {
     SavedMutations.swap(DAG.Mutations);
     bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                           StageID == GCNSchedStageID::ILPInitialSchedule;
-    DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage));
+    DAG.addMutation(createIGroupLPDAGMutation(IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
   }
 
   return true;
@@ -1569,7 +1569,7 @@ void GCNPostScheduleDAGMILive::schedule() {
   if (HasIGLPInstrs) {
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
-    addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true));
+    addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA));
   }
 
   ScheduleDAGMI::schedule();

>From 8c8f6b15ada1b103a495974e1a951d1093ba4162 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 9 Feb 2024 16:04:14 -0800
Subject: [PATCH 2/2] Introduce IGLP_OPT(2): Generalized exp/mfma interleaving
 for select kernels

Change-Id: I1aafb1afea49476718942a8978ee51882a3841b4
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |  895 ++++++-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 2056 +++++++++++++++++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  900 ++++++++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir  |  643 ++++++
 4 files changed, 4475 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 74b62f22aff216..244252cfdeef9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -140,8 +140,6 @@ class SchedGroup {
   // Count of the number of created SchedGroups, used to initialize SGID.
   static unsigned NumSchedGroups;
 
-  const SIInstrInfo *TII;
-
   // Try to add and edge from SU A to SU B.
   bool tryAddEdge(SUnit *A, SUnit *B);
 
@@ -154,6 +152,7 @@ class SchedGroup {
   SmallVector<SUnit *, 32> Collection;
 
   ScheduleDAGInstrs *DAG;
+  const SIInstrInfo *TII;
 
   // Returns true if SU can be added to this SchedGroup.
   bool canAddSU(SUnit &SU) const;
@@ -234,13 +233,13 @@ class SchedGroup {
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 };
@@ -838,6 +837,7 @@ void PipelineSolver::solve() {
 enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
+  MFMAExpInterleave = 2
 };
 
 // Implement a IGLP scheduling strategy.
@@ -904,6 +904,859 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+  // Compute the heuristics for the pipeline, returning whether or not the DAG
+  // is well formatted for the mutation
+  bool analyzeDAG(SmallVectorImpl<SUnit *> &MFMAChainSeeds,
+                  const SIInstrInfo *TII);
+
+  /// Whether or not the instruction is a transitive predecessor of an MFMA
+  /// instruction
+  class IsPipeExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      if (Cache->empty()) {
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++) {
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Cache->push_back(&*I);
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      auto Reaches = (std::any_of(
+          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+          }));
+
+      return Reaches;
+    }
+    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction enables the exact MFMA that is the \p
+  /// Number th MFMA in the chain starting with \p ChainSeed
+  class EnablesNthMFMA final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      bool FoundTrans = false;
+      unsigned Counter = 1;
+      auto DAG = SyncPipe[0].DAG;
+
+      if (Cache->empty()) {
+        auto TII = SyncPipe[0].TII;
+        SmallVector<SUnit *, 8> Worklist;
+
+        auto I = DAG->SUnits.begin();
+        auto E = DAG->SUnits.end();
+        for (; I != E; I++) {
+          if (!FoundTrans) {
+            if (TII->isTRANS(I->getInstr()->getOpcode()))
+              FoundTrans = true;
+            continue;
+          } else {
+            if (TII->isMFMAorWMMA(*I->getInstr())) {
+              if (Counter == Number) {
+                Cache->push_back(&*I);
+                break;
+              }
+              ++Counter;
+            }
+          }
+        }
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the instruction enables a transitive predecessor of the
+  /// same MFMA instruction as an instruction in a SchedGroup \p Number steps
+  /// before
+  class EnablesNthMFMAInChain final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+    SUnit *ChainSeed;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+        return false;
+
+      if (Cache->empty()) {
+        auto TempSU = ChainSeed;
+        auto Depth = Number;
+        while (Depth > 0) {
+          --Depth;
+          bool Found = false;
+          for (auto &Succ : TempSU->Succs) {
+            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
+              TempSU = Succ.getSUnit();
+              Found = true;
+              break;
+            }
+          }
+          if (!Found)
+            return false;
+        }
+
+        Cache->push_back(TempSU);
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
+                          const SIInstrInfo *TII, unsigned SGID,
+                          bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+          ChainSeed(ChainSeed) {}
+  };
+
+  /// Whether or not the instruction has less than \p Size immediate successors.
+  /// If \p HasIntermediary is true, this tests also whether all successors of
+  /// the SUnit have less than \p Size successors.
+  class LessThanNSuccs final : public InstructionRule {
+  private:
+    unsigned Size = 1;
+    bool HasIntermediary = false;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (!SyncPipe.size())
+        return false;
+
+      auto SuccSize = std::count_if(
+          SU->Succs.begin(), SU->Succs.end(),
+          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
+      if (SuccSize >= Size)
+        return false;
+
+      if (HasIntermediary) {
+        for (auto Succ : SU->Succs) {
+          auto SuccSize = std::count_if(
+              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
+              [](const SDep &SuccSucc) {
+                return SuccSucc.getKind() == SDep::Data;
+              });
+          if (SuccSize >= Size)
+            return false;
+        }
+      }
+
+      return true;
+    }
+    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+                   bool HasIntermediary = false, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
+          HasIntermediary(HasIntermediary) {}
+  };
+
+  // Whether or not the instruction is an V_CVT instruction.
+  class IsCvt final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto Opc = SU->getInstr()->getOpcode();
+      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32;
+    }
+    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether or not the instruction is an V_FMA_F32 instruction.
+  class IsFMA final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
+             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
+    }
+    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is an immediate RAW successor
+  /// of the SchedGroup \p Distance steps before.
+  class IsSuccOfPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
+          OtherGroup = &PipeSG;
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      for (auto &OtherEle : OtherGroup->Collection) {
+        for (auto &Succ : OtherEle->Succs) {
+          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
+            return true;
+        }
+      }
+
+      return false;
+    }
+    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                         unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is a transitive successor of any
+  /// instruction in the SchedGroup \p Distance steps before.
+  class IsReachableFromPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
+          OtherGroup = &PipeSG;
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      for (auto &OtherEle : OtherGroup->Collection)
+        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
+          return true;
+
+      return false;
+    }
+    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction occurs after the SU with NodeNum \p Number
+  class OccursAfterDSRead final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      return SU->NodeNum >= Number;
+    }
+    OccursAfterDSRead(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+                      bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the SU is exactly the \p Number th MFMA in the chain
+  /// starting with \p ChainSeed
+  class IsExactMFMA final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+    SUnit *ChainSeed;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto TII = SyncPipe[0].TII;
+      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+        return false;
+
+      if (Cache->empty()) {
+        auto TempSU = ChainSeed;
+        auto Depth = Number;
+        while (Depth > 0) {
+          --Depth;
+          bool Found = false;
+          for (auto &Succ : TempSU->Succs) {
+            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
+              TempSU = Succ.getSUnit();
+              Found = true;
+              break;
+            }
+          }
+          if (!Found) {
+            return false;
+          }
+        }
+        Cache->push_back(TempSU);
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return (*Cache)[0] == SU;
+    }
+
+    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
+                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+          ChainSeed(ChainSeed) {}
+  };
+
+  // Whether the instruction occurs after the first TRANS instruction. This
+  // implies the instruction can not be a predecessor of the first TRANS
+  // instruction
+  class OccursAfterExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      SmallVector<SUnit *, 12> Worklist;
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      if (Cache->empty()) {
+        for (auto &SU : DAG->SUnits)
+          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
+            Cache->push_back(&SU);
+            break;
+          }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return SU->NodeNum > (*Cache)[0]->NodeNum;
+    }
+
+    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+public:
+  void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      IGLPPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = 0;
+  }
+};
+
+// The count of TRANS SUs involved in the interleaved pipeline
+static unsigned TransPipeCount = 0;
+// The count of MFMA SUs involved in the interleaved pipeline
+static unsigned MFMAPipeCount = 0;
+// The number of transitive MFMA successors for each TRANS SU
+static unsigned MFMAEnablement = 0;
+// The number of transitive TRANS predecessors for each MFMA SU
+static unsigned ExpRequirement = 0;
+// The count of independent "chains" of MFMA instructions in the pipeline
+static unsigned MFMAChains = 0;
+// The length of each independent "chain" of MFMA instructions
+static unsigned MFMAChainLength = 0;
+// Whether or not the pipeline has V_CVT instructions
+static bool HasCvt = false;
+// Whether or not there are instructions between the TRANS instruction and V_CVT
+static bool HasChainBetweenCvt;
+// The first occurring DS_READ which feeds an MFMA chain
+static std::optional<unsigned> FirstPipeDSR;
+
+bool MFMAExpInterleaveOpt::analyzeDAG(SmallVectorImpl<SUnit *> &MFMAChainSeeds,
+                                      const SIInstrInfo *TII) {
+  SmallVector<SUnit *, 10> ExpPipeCands;
+  SmallVector<SUnit *, 10> MFMAPipeCands;
+  SmallVector<SUnit *, 10> MFMAPipeSUs;
+  SmallVector<SUnit *, 10> PackSUs;
+  SmallVector<SUnit *, 10> CvtSUs;
+
+  auto isBitPack = [](unsigned Opc) {
+    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
+  };
+
+  auto isCvt = [](unsigned Opc) {
+    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
+  };
+
+  for (SUnit &SU : DAG->SUnits) {
+    auto Opc = SU.getInstr()->getOpcode();
+    if (TII->isTRANS(Opc)) {
+      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+      if (SU.Succs.size() >= 7)
+        continue;
+      for (auto &Succ : SU.Succs) {
+        if (Succ.getSUnit()->Succs.size() >= 7)
+          continue;
+      }
+      ExpPipeCands.push_back(&SU);
+    }
+
+    if (TII->isMFMAorWMMA(*SU.getInstr()))
+      MFMAPipeCands.push_back(&SU);
+
+    if (isBitPack(Opc))
+      PackSUs.push_back(&SU);
+
+    if (isCvt(Opc))
+      CvtSUs.push_back(&SU);
+  }
+
+  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
+    return false;
+
+  TransPipeCount = 0;
+
+  std::optional<SUnit *> TempMFMA;
+  std::optional<SUnit *> TempExp;
+  // Count the number of EXPs that reach an MFMA
+  for (auto &PredSU : ExpPipeCands) {
+    for (auto &SuccSU : MFMAPipeCands) {
+      if (DAG->IsReachable(SuccSU, PredSU)) {
+        if (!TempExp.has_value()) {
+          TempExp = PredSU;
+          TempMFMA = SuccSU;
+        }
+        MFMAPipeSUs.push_back(SuccSU);
+        ++TransPipeCount;
+        break;
+      }
+    }
+  }
+
+  if (!TempExp.has_value())
+    return false;
+
+  HasChainBetweenCvt =
+      std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
+                   [&isCvt](SDep &Succ) {
+                     return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
+                   }) == (*TempExp)->Succs.end();
+
+  // Count the number of MFMAs that are reached by an EXP
+  MFMAPipeCount = 0;
+  for (auto &SuccSU : MFMAPipeCands) {
+    if (MFMAPipeSUs.size() &&
+        std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
+                     [&SuccSU](SUnit *PotentialMatch) {
+                       return PotentialMatch->NodeNum == SuccSU->NodeNum;
+                     }) != MFMAPipeSUs.end()) {
+      ++MFMAPipeCount;
+      continue;
+    }
+    for (auto &PredSU : ExpPipeCands) {
+      if (DAG->IsReachable(SuccSU, PredSU)) {
+        MFMAPipeSUs.push_back(SuccSU);
+        ++MFMAPipeCount;
+        break;
+      }
+    }
+  }
+
+  if (!TempMFMA.has_value() || !TempExp.has_value())
+    return false;
+
+  std::optional<SUnit *> TempCvt;
+  for (auto &SuccSU : CvtSUs) {
+    if (DAG->IsReachable(SuccSU, *TempExp)) {
+      TempCvt = SuccSU;
+      break;
+    }
+  }
+
+  HasCvt = false;
+  if (TempCvt.has_value()) {
+    for (auto &SuccSU : MFMAPipeSUs) {
+      if (DAG->IsReachable(SuccSU, *TempCvt)) {
+        HasCvt = true;
+        break;
+      }
+    }
+  }
+
+  MFMAChains = 0;
+  for (auto &MFMAPipeSU : MFMAPipeSUs) {
+    if (MFMAChainSeeds.size() &&
+        std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
+            MFMAChainSeeds.end())
+      continue;
+    if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
+                     [&TII](SDep &Succ) {
+                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+                     })) {
+      MFMAChainSeeds.push_back(MFMAPipeSU);
+      ++MFMAChains;
+    }
+  }
+
+  if (!MFMAChains)
+    return false;
+
+  for (auto Pred : MFMAChainSeeds[0]->Preds) {
+    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
+        Pred.getSUnit()->getInstr()->mayLoad())
+      FirstPipeDSR = Pred.getSUnit()->NodeNum;
+  }
+
+  MFMAChainLength = MFMAPipeCount / MFMAChains;
+
+  // The number of bit pack operations that depend on a single V_EXP
+  unsigned PackSuccCount = std::count_if(
+      PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
+        return DAG->IsReachable(VPack, *TempExp);
+      });
+
+  // The number of bit pack operations an MFMA depends on
+  unsigned PackPredCount =
+      std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                    [&isBitPack](SDep &Pred) {
+                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                      return isBitPack(Opc);
+                    });
+
+  auto PackPred =
+      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                   [&isBitPack](SDep &Pred) {
+                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                     return isBitPack(Opc);
+                   });
+
+  if (PackPred == (*TempMFMA)->Preds.end())
+    return false;
+
+  MFMAEnablement = 0;
+  ExpRequirement = 0;
+  // How many MFMAs depend on a single bit pack operation
+  MFMAEnablement =
+      std::count_if(PackPred->getSUnit()->Succs.begin(),
+                    PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
+                      return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+                    });
+
+  // The number of MFMAs that depend on a single V_EXP
+  MFMAEnablement *= PackSuccCount;
+
+  // The number of V_EXPs required to resolve all dependencies for an MFMA
+  ExpRequirement =
+      std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
+                    [this, &PackPred](SUnit *ExpBase) {
+                      return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
+                    });
+
+  ExpRequirement *= PackPredCount;
+  return true;
+}
+
+void MFMAExpInterleaveOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    IGLPPhase Phase) {
+
+  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  SmallVector<SUnit *, 4> MFMAChainSeeds;
+  if (Phase != IGLPPhase::PostRA && !analyzeDAG(MFMAChainSeeds, TII))
+    return;
+
+  bool IsSmallKernelType =
+      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
+  bool IsLargeKernelType =
+      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
+  bool IsTinyKernelType =
+      MFMAEnablement == 1 && ExpRequirement == 4 && TransPipeCount == 32;
+
+  if (!(IsSmallKernelType || IsLargeKernelType || IsTinyKernelType))
+    return;
+
+  unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+
+  unsigned MFMAChain = 0;
+  unsigned PositionInChain = 0;
+  unsigned CurrMFMAForTransPosition = 0;
+
+  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
+                                 &CurrMFMAForTransPosition]() {
+    CurrMFMAForTransPosition += MFMAEnablement;
+    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
+    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
+  };
+
+  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
+    return (TempMFMAForTrans / MFMAChains);
+  };
+
+  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
+    return TempMFMAForTrans % MFMAChains;
+  };
+
+  unsigned CurrMFMAPosition = 0;
+  unsigned MFMAChainForMFMA = 0;
+  unsigned PositionInChainForMFMA = 0;
+
+  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
+                                &PositionInChainForMFMA]() {
+    ++CurrMFMAPosition;
+    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
+    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
+  };
+
+  bool IsPostRA = Phase == IGLPPhase::PostRA;
+  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
+  bool UsesFMA = !IsLargeKernelType || !IsPostRA;
+  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
+  bool UsesCvt = HasCvt && (!IsLargeKernelType || !IsPostRA);
+
+  // PHASE 1: "Prefetch"
+  if (UsesFMA) {
+    // First Round FMA
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
+    if (!IsPostRA && MFMAChains) {
+      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+          true));
+    } else
+      SG->addRule(
+          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    // Second Round FMA
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
+    if (!IsPostRA && MFMAChains) {
+      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+          getNextTransPositionInChain(),
+          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
+    } else
+      SG->addRule(
+          std::make_shared<EnablesNthMFMA>(2, TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  if (UsesDSRead) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+    SG->addRule(
+        std::make_shared<OccursAfterDSRead>(*FirstPipeDSR, TII, SG->getSGID()));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  // First Round EXP
+  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
+  if (!IsPostRA && MFMAChains)
+    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
+  else
+    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
+  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+                                               HasChainBetweenCvt));
+  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+  incrementTransPosition();
+
+  // First Round CVT, Third Round FMA, Second Round EXP; interleaved
+  for (unsigned I = 0; I < ExpRequirement; I++) {
+    // First Round CVT
+    if (UsesCvt) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
+      if (HasChainBetweenCvt)
+        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
+      else
+        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
+            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    // Third Round FMA
+    if (UsesFMA) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+      if (!IsPostRA && MFMAChains) {
+        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+            getNextTransPositionInChain(),
+            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
+      } else
+        SG->addRule(
+            std::make_shared<EnablesNthMFMA>(3, TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    // Second Round EXP
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    if (!IsPostRA && MFMAChains)
+      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+          true));
+    else
+      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
+                                                   SG->getSGID(), true));
+    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+                                                 HasChainBetweenCvt));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  // PHASE 2: Main Interleave Loop
+
+  // The number of MFMAs per iteration
+  auto MFMARatio =
+      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
+  // The number of Exps per iteration
+  auto ExpRatio =
+      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
+  // The remaining Exps
+  auto RemainingExp = TransPipeCount - (2 * ExpRequirement);
+  auto ExpLoopCount = RemainingExp / ExpRatio;
+  // In loop MFMAs
+  auto MFMAInLoop = MFMAPipeCount - (MFMAEnablement * 2);
+  auto MFMALoopCount = MFMAInLoop / MFMARatio;
+  auto LoopSize = std::min(ExpLoopCount, MFMALoopCount);
+
+  for (unsigned I = 0; I < LoopSize; I++) {
+    if (!(I * ExpRatio % ExpRequirement))
+      incrementTransPosition();
+
+    // Round N MFMA
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
+    if (!IsPostRA && MFMAChains)
+      SG->addRule(std::make_shared<IsExactMFMA>(
+          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
+          SG->getSGID(), true));
+    else
+      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    incrementMFMAPosition();
+
+    if (UsesDSRead && !(I % 4)) {
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+      SG->addRule(std::make_shared<OccursAfterDSRead>(*FirstPipeDSR, TII,
+                                                      SG->getSGID()));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+
+    // CVT, EXP, FMA Interleaving
+    for (unsigned J = 0; J < ExpRatio; J++) {
+      auto MFMAOffset = MFMARatio * (I + 1);
+      auto MaxMFMAOffset = ExpRequirement * MFMARatio / ExpRatio;
+
+      // Round N + 1 CVT
+      if (UsesCvt) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
+        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
+        auto DSROffset = I / 4 + 1;
+        auto MaxDSROffset = MaxMFMAOffset / 4;
+        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
+                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff;
+        if (HasChainBetweenCvt)
+          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+              CurrentOffset, TII, SG->getSGID()));
+        else
+          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
+                                                             SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+
+      // Round N + 3 FMA
+      if (UsesFMA) {
+        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+        if (!IsPostRA && MFMAChains)
+          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+              getNextTransPositionInChain(),
+              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
+              true));
+        else
+          SG->addRule(std::make_shared<EnablesNthMFMA>(
+              3 + std::min(MaxMFMAOffset, MFMAOffset), TII, SG->getSGID(),
+              true));
+        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+      }
+
+      // Round N + 2 Exp
+      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+      if (!IsPostRA && MFMAChains)
+        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+            true));
+      else
+        SG->addRule(std::make_shared<EnablesNthMFMA>(
+            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
+            TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+                                                   HasChainBetweenCvt));
+      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+    }
+  }
+
+  // PHASE 3: Remaining MFMAs
+  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
+  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  return;
+}
+
 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 private:
   // Whether the DS_READ is a predecessor of first four MFMA in region
@@ -1122,8 +1975,10 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
   unsigned MFMACount = 0;
   unsigned DSRCount = 0;
 
-  assert((Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
-                        DSWWithSharedVMEMCount == 0)) &&
+  bool IsInitial = Phase == IGLPPhase::Initial;
+
+  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+                         DSWWithSharedVMEMCount == 0)) &&
          "DSWCounters should be zero in pre-RA scheduling!");
   SmallVector<SUnit *, 6> DSWithPerms;
   for (auto &SU : DAG->SUnits) {
@@ -1133,7 +1988,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     else if (TII->isDS(*I)) {
       if (I->mayLoad())
         ++DSRCount;
-      else if (I->mayStore() && Phase == IGLPPhase::Initial) {
+      else if (I->mayStore() && IsInitial) {
         ++DSWCount;
         for (auto Pred : SU.Preds) {
           if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1146,7 +2001,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     }
   }
 
-  if (Phase == IGLPPhase::Initial) {
+  if (IsInitial) {
     DSWWithPermCount = DSWithPerms.size();
     auto I = DSWithPerms.begin();
     auto E = DSWithPerms.end();
@@ -1254,14 +2109,14 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         1, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1272,7 +2127,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         3, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1290,7 +2145,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1311,7 +2166,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1325,7 +2180,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
         SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
-    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1336,7 +2191,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         2, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1347,7 +2202,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
         SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
     SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
         4, TII, SG->getSGID(), true));
-    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
     SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
     SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1364,6 +2219,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
   case MFMASmallGemmSingleWaveOptID:
     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+  case MFMAExpInterleave:
+    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -1413,7 +2270,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // first created SchedGroup first.
   bool IsBottomUp = 1;
 
-  // Whether or not this is a reentry into the IGroupLPDAGMutation.
+  // The scheduling phase this application of IGLP corresponds with.
   IGLPPhase Phase = IGLPPhase::Initial;
 
   IGroupLPDAGMutation() = default;
@@ -1591,7 +2448,6 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
     auto &SU = *I;
     if (isFull())
       break;
-
     if (canAddSU(SU))
       SyncedInstrs[&SU].push_back(SGID);
   }
@@ -1730,7 +2586,8 @@ namespace llvm {
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase) {
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(IGLPPhase Phase) {
   return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
new file mode 100644
index 00000000000000..001264401e6fbf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -0,0 +1,2056 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @largeInterleave() #0 { ret void }
+  ; GCN-LABEL: largeInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr94
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $vgpr106
+  ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr133
+  ; GCN-NEXT:    ; implicit-def: $vgpr139
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+  ; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_lshl_add_u32 v0, s7, 4, v2
+  ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v92, v0, v1, 1
+  ; GCN-NEXT:    v_add_u32_e32 v93, s0, v92
+  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    s_lshl_b32 s0, s7, 7
+  ; GCN-NEXT:    v_add_lshl_u32 v95, v8, s0, 1
+  ; GCN-NEXT:    v_add_u32_e32 v8, 64, v93
+  ; GCN-NEXT:    ; kill: killed $vgpr8
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:    ; kill: killed $vgpr92
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[0:3]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[4:7] offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0x80, v93
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    ; implicit-def: $vgpr73
+  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr74
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
+  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
+  ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v94
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[90:93], v94 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
+  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v94 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[94:97], v106
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
+  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:1024
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
+  ; GCN-NEXT:    v_perm_b32 v94, v102, v98, s5
+  ; GCN-NEXT:    v_perm_b32 v98, v102, v98, s8
+  ; GCN-NEXT:    v_perm_b32 v102, v103, v99, s5
+  ; GCN-NEXT:    v_perm_b32 v95, v104, v100, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
+  ; GCN-NEXT:    v_perm_b32 v96, v103, v99, s8
+  ; GCN-NEXT:    v_perm_b32 v99, v104, v100, s8
+  ; GCN-NEXT:    v_perm_b32 v103, v105, v101, s5
+  ; GCN-NEXT:    v_perm_b32 v97, v105, v101, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
+  ; GCN-NEXT:    s_nop 5
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v101, s4, v49
+  ; GCN-NEXT:    v_max3_f32 v92, v100, s6, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v50
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v51
+  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
+  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v52
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
+  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v53
+  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v54
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v55
+  ; GCN-NEXT:    v_max3_f32 v84, v92, v84, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v56
+  ; GCN-NEXT:    v_mul_f32_e32 v92, s4, v57
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v92
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v58
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v59
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v60
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v61
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v62
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v63
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
+  ; GCN-NEXT:    s_nop 6
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v33
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v34
+  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v35
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v37
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v39
+  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v41
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
+  ; GCN-NEXT:    v_max3_f32 v80, v84, v85, v80
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v42
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v43
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v45
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v46
+  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v47
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v17
+  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
+  ; GCN-NEXT:    v_max3_f32 v68, v80, v68, v69
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v76, s4, v21
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v76
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v23
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v25
+  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v27
+  ; GCN-NEXT:    v_max3_f32 v64, v68, v69, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v28
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v29
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v31
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v1
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v3
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v5
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v7
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v9
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v11
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v13
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
+  ; GCN-NEXT:    ; implicit-def: $vgpr65
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $vgpr68
+  ; GCN-NEXT:    ; implicit-def: $vgpr67
+  ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
+  ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
+  ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v135, v66, v65, 1
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v133, v64
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_lshl_add_u32 v136, v66, 1, v135
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_lshl_add_u32 v137, v66, 1, v136
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
+  ; GCN-NEXT:    v_lshl_add_u32 v138, v66, 1, v137
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v135, v[94:95]
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v133, v64
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v136, v[98:99]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v137, v[102:103]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
+  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    ; implicit-def: $vgpr65
+  ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
+  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
+  ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
+  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v49, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_exp_f32_e32 v51, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v52, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v52, v48
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v53, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v68
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v54, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v53, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v54, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_fma_f32 v71, s4, v55, -v134
+  ; GCN-NEXT:    v_fma_f32 v80, s4, v57, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v51
+  ; GCN-NEXT:    v_exp_f32_e32 v55, v48
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v48, v52
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v139
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pack_b32_f16 v161, v57, v48
+  ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v50
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v71
+  ; GCN-NEXT:    v_fma_f32 v56, s4, v56, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v66
+  ; GCN-NEXT:    v_exp_f32_e32 v57, v57
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v56, 0x3fb8aa3b, v56
+  ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v58, v56
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v56, 0x3fb8aa3b, v80
+  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v56
+  ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v56, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
+  ; GCN-NEXT:    v_exp_f32_e32 v164, v56
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v56, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
+  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
+  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v56
+  ; GCN-NEXT:    v_mul_f32_e32 v56, 0x3fb8aa3b, v59
+  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
+  ; GCN-NEXT:    v_exp_f32_e32 v59, v56
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    ; implicit-def: $vgpr56
+  ; GCN-NEXT:    ds_read_b128 v[60:63], v56
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
+  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v56 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
+  ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
+  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v151, v33
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
+  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v56 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v56 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
+  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
+  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
+  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
+  ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
+  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
+  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
+  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
+  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
+  ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
+  ; GCN-NEXT:    ; implicit-def: $vgpr33
+  ; GCN-NEXT:    ; implicit-def: $vgpr38
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
+  ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
+  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr36
+  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
+  ; GCN-NEXT:    ; implicit-def: $vgpr37
+  ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
+  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
+  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v139
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
+  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
+  ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
+  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
+  ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
+  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v56
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v56 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
+  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
+  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
+  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
+  ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
+  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
+  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v56 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v56 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
+  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
+  ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v130, v37
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
+  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
+  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    v_exp_f32_e32 v144, v22
+  ; GCN-NEXT:    ; implicit-def: $vgpr17
+  ; GCN-NEXT:    ; implicit-def: $vgpr22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v138, v[42:43]
+  ; GCN-NEXT:    v_add_u32_e32 v22, v132, v22
+  ; GCN-NEXT:    v_add_u32_e32 v17, v132, v17
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr20
+  ; GCN-NEXT:    ; implicit-def: $vgpr21
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
+  ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
+  ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
+  ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v132, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
+  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v139
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v62, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
+  ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
+  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
+  ; GCN-NEXT:    v_exp_f32_e32 v145, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
+  ; GCN-NEXT:    v_exp_f32_e32 v35, v16
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v34, s4, v27, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ds_read_b128 v[24:27], v139 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v46, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v47
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v132
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v47, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v36, v62
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v34
+  ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
+  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
+  ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v33, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v56
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v36, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v56 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_exp_f32_e32 v39, v24
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v34, v1
+  ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
+  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v56 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v56 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
+  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
+  ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v38, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
+  ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
+  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
+  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b64 v135, v[4:5]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
+  ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_exp_f32_e32 v42, v1
+  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v137, v[0:1]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v138, v[26:27]
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v6
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_pack_b32_f16 v16, v37, v28
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v7, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v139
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v26, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v9, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v29, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v41
+  ; GCN-NEXT:    v_fma_f32 v30, s4, v10, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v42
+  ; GCN-NEXT:    v_exp_f32_e32 v31, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v25
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v17, v2, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27
+  ; GCN-NEXT:    v_exp_f32_e32 v19, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v139 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    ds_read_b128 v[8:11], v139 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v24, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v28
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v26
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v29
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v13, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v30
+  ; GCN-NEXT:    v_exp_f32_e32 v30, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v5, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v9
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v56
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[12:15], v56 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v21, v9
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30
+  ; GCN-NEXT:    v_exp_f32_e32 v2, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v17, v20
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v3, v0
+  ; GCN-NEXT:    v_add_f32_e32 v3, 0, v49
+  ; GCN-NEXT:    v_add_f32_e32 v3, v50, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v51, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v52, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v53, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v57, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v145, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v42, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v25, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v26, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v29, v3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
+  ; GCN-NEXT:    v_add_f32_e32 v3, v31, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v19, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v24, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
+  ; GCN-NEXT:    v_add_f32_e32 v3, v27, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v30, v3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
+  ; GCN-NEXT:    v_add_f32_e32 v3, v16, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v18, v3
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22
+  ; GCN-NEXT:    v_add_f32_e32 v3, v21, v3
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
+  ; GCN-NEXT:    v_add_f32_e32 v0, v2, v3
+  ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
+  ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v56 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v133, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v56 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+  !0 = !{i64 2862105}
+
+...
+
+---
+name:            largeInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+    %11:vgpr_32 = IMPLICIT_DEF
+    %1:sgpr_512 = IMPLICIT_DEF
+    %16:vgpr_32 = IMPLICIT_DEF
+    %443:sgpr_128 = IMPLICIT_DEF
+    %18:sreg_32 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %23:vgpr_32 = IMPLICIT_DEF
+    %391:vreg_128_align2 = IMPLICIT_DEF
+    %24:vgpr_32 = IMPLICIT_DEF
+    %392:vreg_128_align2 = IMPLICIT_DEF
+    %401:vreg_128_align2 = IMPLICIT_DEF
+    %406:vreg_128_align2 = IMPLICIT_DEF
+    %48:vgpr_32 = IMPLICIT_DEF
+    %473:sgpr_128 = IMPLICIT_DEF
+    %411:vreg_128_align2 = IMPLICIT_DEF
+    %416:vreg_128_align2 = IMPLICIT_DEF
+    %421:vreg_128_align2 = IMPLICIT_DEF
+    %426:vreg_128_align2 = IMPLICIT_DEF
+    %1114:sgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %484:sreg_64_xexec = IMPLICIT_DEF
+    %3346:vgpr_32 = IMPLICIT_DEF
+    %1422:sreg_32 = IMPLICIT_DEF
+    %1424:sreg_32 = IMPLICIT_DEF
+    %15:vgpr_32 = IMPLICIT_DEF
+    %494:sreg_32 = IMPLICIT_DEF
+    %47:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %43:vgpr_32 = IMPLICIT_DEF
+    %44:vgpr_32 = IMPLICIT_DEF
+    %45:vgpr_32 = IMPLICIT_DEF
+    %50:sreg_32 = IMPLICIT_DEF
+    %3347:vgpr_32 = IMPLICIT_DEF
+    %3329:vgpr_32 = IMPLICIT_DEF
+    %3330:vgpr_32 = IMPLICIT_DEF
+    %3331:vgpr_32 = IMPLICIT_DEF
+    %3332:vgpr_32 = IMPLICIT_DEF
+    %3333:vgpr_32 = IMPLICIT_DEF
+    %2986:vreg_512_align2 = IMPLICIT_DEF
+    %3038:vreg_512_align2 = IMPLICIT_DEF
+    %2980:vreg_512_align2 = IMPLICIT_DEF
+    %3003:vreg_512_align2 = IMPLICIT_DEF
+    %3334:vgpr_32 = IMPLICIT_DEF
+    %3335:vgpr_32 = IMPLICIT_DEF
+    %3336:vgpr_32 = IMPLICIT_DEF
+    %3337:vgpr_32 = IMPLICIT_DEF
+    %3338:vgpr_32 = IMPLICIT_DEF
+    %3339:vgpr_32 = IMPLICIT_DEF
+    %3345:vgpr_32 = IMPLICIT_DEF
+    %3340:vgpr_32 = IMPLICIT_DEF
+    %3341:vgpr_32 = IMPLICIT_DEF
+    %3342:vgpr_32 = IMPLICIT_DEF
+    %3343:vgpr_32 = IMPLICIT_DEF
+    %3344:vgpr_32 = IMPLICIT_DEF
+    %84:vgpr_32 = COPY %3347
+    %86:vgpr_32 = COPY %3347:vgpr_32
+    IGLP_OPT 2
+    %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec
+    %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec
+    %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec
+    %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec
+    %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec
+    %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc
+    %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec
+    %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec
+    %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec
+    %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    early-clobber %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec
+    %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec
+    %708:vgpr_32 = V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec
+    %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec
+    %798:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec
+    %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec
+    %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec
+    %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec
+    %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec
+    %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec
+    %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %821:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1057:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1069:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1088:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec
+    %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec
+    %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec
+    %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec
+    %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec
+    %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit $mode, implicit $exec
+    %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec
+    %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec
+    %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec
+    %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec
+    %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec
+    %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec
+    %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec
+    %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec
+    %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec
+    %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, %1061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1133:vgpr_32 = V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec
+    %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec
+    %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, implicit $mode, implicit $exec
+    %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec
+    %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec
+    %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec
+    %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec
+    %151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec
+    %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec
+    %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec
+    %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec
+    %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec
+    %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec
+    %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec
+    %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec
+    %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec
+    %1168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec
+    %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec
+    %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec
+    %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec
+    %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec
+    %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec
+    %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec
+    %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec
+    %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec
+    %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec
+    %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit $mode, implicit $exec
+    %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec
+    %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec
+    %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, implicit $mode, implicit $exec
+    %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec
+    %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec
+    %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec
+    %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec
+    %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec
+    %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec
+    %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec
+    %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec
+    %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec
+    %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec
+    %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec
+    %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec
+    %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec
+    %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec
+    %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec
+    %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec
+    %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec
+    %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec
+    %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec
+    %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, implicit $mode, implicit $exec
+    %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec
+    %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec
+    %1222:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec
+    %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec
+    %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec
+    %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec
+    %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec
+    %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec
+    %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec
+    %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec
+    %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1235:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec
+    %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec
+    %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit $exec
+    %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec
+    %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec
+    %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec
+    %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec
+    %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec
+    %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec
+    %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec
+    %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec
+    %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec
+    %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec
+    %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec
+    %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec
+    %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec
+    %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec
+    %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec
+    %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec
+    %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec
+    %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec
+    %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec
+    %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec
+    %1269:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec
+    %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec
+    %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, implicit $exec
+    %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec
+    %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec
+    %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec
+    %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec
+    %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec
+    %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec
+    %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec
+    %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec
+    %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec
+    %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec
+    %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1288:vgpr_32, implicit $mode, implicit $exec
+    %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec
+    %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec
+    %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec
+    %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec
+    %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec
+    %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec
+    %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec
+    %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec
+    %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec
+    %1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec
+    %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec
+    %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec
+    %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec
+    %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec
+    %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec
+    %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec
+    %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec
+    %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec
+    %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec
+    %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1318:vgpr_32, implicit $mode, implicit $exec
+    %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec
+    %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec
+    %1323:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec
+    %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec
+    %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec
+    %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec
+    %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec
+    %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec
+    %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec
+    %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec
+    %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec
+    %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec
+    %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec
+    %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec
+    %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec
+    %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec
+    %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec
+    %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec
+    %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec
+    %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec
+    %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec
+    %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec
+    %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec
+    %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec
+    %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, implicit $exec
+    %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec
+    %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec
+    %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec
+    %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, %1356:vgpr_32, implicit $mode, implicit $exec
+    %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec
+    %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec
+    %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec
+    %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec
+    %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec
+    %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec
+    %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec
+    %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec
+    %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec
+    %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec
+    %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec
+    %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec
+    %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec
+    %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec
+    %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec
+    %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec
+    %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec
+    %1375:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec
+    %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec
+    %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec
+    %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec
+    %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec
+    %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec
+    %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec
+    %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec
+    %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec
+    %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec
+    %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec
+    %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec
+    %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec
+    %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec
+    %1389:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec
+    %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec
+    %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec
+    %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, implicit $exec
+    %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec
+    %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec
+    %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec
+    %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec
+    %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec
+    %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec
+    %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec
+    %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec
+    %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec
+    %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec
+    %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec
+    %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec
+    %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec
+    %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit $mode, implicit $exec
+    %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec
+    %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec
+    %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec
+    %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec
+    %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec
+    %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec
+    %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec
+    %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec
+    %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec
+    %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec
+    %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec
+    undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec
+    %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec
+    %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec
+    %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec
+    %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec
+    %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec
+    %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec
+    %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec
+    %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec
+    %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec
+    %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec
+    %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec
+    %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec
+    %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec
+    %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec
+    %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec
+    %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1188:vgpr_32, implicit $mode, implicit $exec
+    %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec
+    %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec
+    %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec
+    %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec
+    %1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec
+    %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec
+    %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec
+    %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec
+    %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec
+    %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec
+    %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec
+    %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec
+    %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec
+    %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec
+    %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec
+    %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec
+    %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec
+    %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec
+    %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec
+    %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec
+    %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec
+    %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec
+    %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec
+    %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec
+    %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec
+    %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1293:vgpr_32, implicit $mode, implicit $exec
+    %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec
+    %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec
+    %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec
+    %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec
+    %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec
+    %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1647:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec
+    %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec
+    %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec
+    %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec
+    %1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec
+    %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3029.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec
+    %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec
+    %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec
+    %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec
+    %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec
+    %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %1823:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+    undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+    %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec
+    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec
+    %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec
+    %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec
+    %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec
+    %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec
+    %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec
+    %1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec
+    %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec
+    %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec
+    %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec
+    %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1275:vgpr_32, implicit $mode, implicit $exec
+    %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec
+    %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec
+    undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec
+    %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec
+    %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec
+    %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec
+    %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec
+    %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec
+    %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec
+    %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec
+    %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec
+    %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec
+    %2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec
+    %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec
+    %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec
+    %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec
+    %2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec
+    %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec
+    %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec
+    %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec
+    %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec
+    %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec
+    %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, implicit $exec
+    %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec
+    %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec
+    %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec
+    %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec
+    %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3335:vgpr_32, implicit $exec
+    %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec
+    %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec
+    %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec
+    %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec
+    %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec
+    %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
new file mode 100644
index 00000000000000..43c135982d525b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @smallInterleave() #0 { ret void }
+  ; GCN-LABEL: smallInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
+  ; GCN-NEXT:    ; implicit-def: $sgpr4
+  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN-NEXT:    ; implicit-def: $vgpr49
+  ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+  ; GCN-NEXT:    ; implicit-def: $vgpr51
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr77
+  ; GCN-NEXT:    ; implicit-def: $vgpr78
+  ; GCN-NEXT:    ; implicit-def: $vgpr79
+  ; GCN-NEXT:    ; implicit-def: $vgpr80
+  ; GCN-NEXT:    ; implicit-def: $vgpr92
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
+  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
+  ; GCN-NEXT:    ; implicit-def: $vgpr5
+  ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
+  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
+  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $sgpr6
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v49
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v49 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
+  ; GCN-NEXT:    ; kill: killed $vgpr1
+  ; GCN-NEXT:    ; kill: killed $vgpr0
+  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    ; implicit-def: $sgpr3
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b128 v48, v[32:35]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v49
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+  ; GCN-NEXT:    ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+  ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+  ; GCN-NEXT:    ; implicit-def: $vgpr32
+  ; GCN-NEXT:    ; implicit-def: $vgpr33
+  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
+  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
+  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $vgpr67
+  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
+  ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
+  ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
+  ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
+  ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v27
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v87
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
+  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v79, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr84
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
+  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
+  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_fma_f32 v3, s4, v3, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
+  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v80
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v28, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_fma_f32 v30, s4, v30, -v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    ds_read_b128 v[26:29], v92
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
+  ; GCN-NEXT:    v_perm_b32 v90, v68, v64, s2
+  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
+  ; GCN-NEXT:    v_perm_b32 v91, v69, v65, s2
+  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v92 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v76, v90
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_fma_f32 v31, s4, v31, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v22, v22
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v82
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v78, v91
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v79, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v87
+  ; GCN-NEXT:    v_pack_b32_f16 v19, v23, v89
+  ; GCN-NEXT:    v_pack_b32_f16 v18, v18, v86
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v23, v65
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v30, 0x3fb8aa3b, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v31
+  ; GCN-NEXT:    v_exp_f32_e32 v20, v20
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v85
+  ; GCN-NEXT:    v_exp_f32_e32 v19, v30
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v88
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v64
+  ; GCN-NEXT:    v_mul_f32_e32 v25, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v30, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v21, v31
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v24
+  ; GCN-NEXT:    v_fma_f32 v4, s4, v4, -v72
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v5, -v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v8, -v72
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v19
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v18, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v5, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v29, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v27
+  ; GCN-NEXT:    v_exp_f32_e32 v21, v25
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v30
+  ; GCN-NEXT:    v_exp_f32_e32 v26, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v4
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v21
+  ; GCN-NEXT:    v_exp_f32_e32 v24, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v25
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[0:1], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v26
+  ; GCN-NEXT:    v_exp_f32_e32 v30, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v8
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v28, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v24
+  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v10, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v10
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v7
+  ; GCN-NEXT:    v_exp_f32_e32 v28, v5
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v84 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v29
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v30
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v15, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v0, v31
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v4, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v28
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_exp_f32_e32 v13, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v0, 0, v73
+  ; GCN-NEXT:    v_add_f32_e32 v0, v17, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v74, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v75, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v80, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v81, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v82, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v83, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v19, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v20, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v21, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v18, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_add_f32_e32 v0, v29, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v6, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_add_f32_e32 v0, v30, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v14, v1
+  ; GCN-NEXT:    v_add_f32_e32 v9, v28, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v92
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v10
+  ; GCN-NEXT:    v_add_f32_e32 v9, v67, v9
+  ; GCN-NEXT:    v_add_f32_e32 v8, v8, v9
+  ; GCN-NEXT:    v_add_f32_e32 v8, v10, v8
+  ; GCN-NEXT:    v_add_f32_e32 v10, v11, v8
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v12
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v14
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v13
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v11
+  ; GCN-NEXT:    v_add_f32_e32 v1, v13, v10
+  ; GCN-NEXT:    v_add_f32_e32 v10, v6, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v15
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v17
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v92 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v14, v10
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+
+  !0 = !{i64 2862105}
+
+...
+
+---
+name:            smallInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+  %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  %1:vgpr_32 = COPY %0:vgpr_32
+  %2:vgpr_32 = IMPLICIT_DEF
+  %3:sreg_32 = IMPLICIT_DEF
+  %4:vreg_64_align2 = IMPLICIT_DEF
+  %5:sgpr_128 = IMPLICIT_DEF
+  %6:vgpr_32 = IMPLICIT_DEF
+  %7:vgpr_32 = IMPLICIT_DEF
+  %8:sgpr_128 = IMPLICIT_DEF
+  %9:vgpr_32 = IMPLICIT_DEF
+  %10:sgpr_512 = IMPLICIT_DEF
+  %11:sgpr_32 = IMPLICIT_DEF
+  %12:sreg_64_xexec = IMPLICIT_DEF
+  %13:vgpr_32 = IMPLICIT_DEF
+  %14:sreg_32 = IMPLICIT_DEF
+  %15:sreg_32 = IMPLICIT_DEF
+  %16:vgpr_32 = IMPLICIT_DEF
+  %17:sreg_32 = IMPLICIT_DEF
+  %18:vgpr_32 = IMPLICIT_DEF
+  %19:vgpr_32 = IMPLICIT_DEF
+  %20:vgpr_32 = IMPLICIT_DEF
+  %21:vgpr_32 = IMPLICIT_DEF
+  %22:vgpr_32 = IMPLICIT_DEF
+  %23:vgpr_32 = IMPLICIT_DEF
+  %24:vgpr_32 = IMPLICIT_DEF
+  %25:vgpr_32 = IMPLICIT_DEF
+  %26:sreg_32 = IMPLICIT_DEF
+  %42:vgpr_32 = IMPLICIT_DEF
+  %44:vreg_128_align2 = IMPLICIT_DEF
+  %48:vgpr_32 = IMPLICIT_DEF
+  %49:vreg_128_align2 = IMPLICIT_DEF
+  %52:vreg_128_align2 = IMPLICIT_DEF
+  %55:vreg_128_align2 = IMPLICIT_DEF
+  %106:vgpr_32 = IMPLICIT_DEF
+  %29:vgpr_32 = IMPLICIT_DEF
+  %37:vgpr_32 = IMPLICIT_DEF
+  %259:vreg_512_align2 = IMPLICIT_DEF
+  %260:vreg_512_align2 = IMPLICIT_DEF
+  IGLP_OPT 2
+  %27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec
+  %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec
+  %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, %4:vreg_64_align2, 0, implicit $exec
+  %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc
+  %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec
+  DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec
+  %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec
+  %36:vgpr_32 = V_ADD_U32_e32 %7:vgpr_32, %37:vgpr_32, implicit $exec
+  %38:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %36:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %39:vgpr_32 = V_ADD_U32_e32 %9:vgpr_32, %37:vgpr_32, implicit $exec
+  %40:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %39:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %41:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+  early-clobber %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %41.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %41.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %45:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+  early-clobber %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %45.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %45.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %47:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %50:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  DS_WRITE_B128_gfx9 %34:vgpr_32, %35:vreg_128_align2, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %51:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %53:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %54:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %56:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %57:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub0:vreg_512_align2, implicit $mode, implicit $exec
+  %58:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub1:vreg_512_align2, implicit $mode, implicit $exec
+  %59:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub2:vreg_512_align2, implicit $mode, implicit $exec
+  %60:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub3:vreg_512_align2, implicit $mode, implicit $exec
+  %61:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub4:vreg_512_align2, implicit $mode, implicit $exec
+  %62:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub5:vreg_512_align2, implicit $mode, implicit $exec
+  %63:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub6:vreg_512_align2, implicit $mode, implicit $exec
+  %64:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub7:vreg_512_align2, implicit $mode, implicit $exec
+  %65:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub8:vreg_512_align2, implicit $mode, implicit $exec
+  %66:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub9:vreg_512_align2, implicit $mode, implicit $exec
+  %67:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub10:vreg_512_align2, implicit $mode, implicit $exec
+  %68:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub11:vreg_512_align2, implicit $mode, implicit $exec
+  %69:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub12:vreg_512_align2, implicit $mode, implicit $exec
+  %70:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub13:vreg_512_align2, implicit $mode, implicit $exec
+  %71:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub14:vreg_512_align2, implicit $mode, implicit $exec
+  %72:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub15:vreg_512_align2, implicit $mode, implicit $exec
+  %73:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub0:vreg_512_align2, implicit $mode, implicit $exec
+  %74:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub1:vreg_512_align2, implicit $mode, implicit $exec
+  %75:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub2:vreg_512_align2, implicit $mode, implicit $exec
+  %76:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub3:vreg_512_align2, implicit $mode, implicit $exec
+  %77:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub4:vreg_512_align2, implicit $mode, implicit $exec
+  %78:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub5:vreg_512_align2, implicit $mode, implicit $exec
+  %79:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub6:vreg_512_align2, implicit $mode, implicit $exec
+  %80:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub7:vreg_512_align2, implicit $mode, implicit $exec
+  %81:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub8:vreg_512_align2, implicit $mode, implicit $exec
+  %82:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub9:vreg_512_align2, implicit $mode, implicit $exec
+  %83:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub10:vreg_512_align2, implicit $mode, implicit $exec
+  %84:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub11:vreg_512_align2, implicit $mode, implicit $exec
+  %85:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub12:vreg_512_align2, implicit $mode, implicit $exec
+  %86:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub13:vreg_512_align2, implicit $mode, implicit $exec
+  %87:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub14:vreg_512_align2, implicit $mode, implicit $exec
+  %88:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub15:vreg_512_align2, implicit $mode, implicit $exec
+  %89:vgpr_32 = V_MAX3_F32_e64 0, %57:vgpr_32, 0, %11:sgpr_32, 0, %58:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %90:vgpr_32 = V_MAX3_F32_e64 0, %89:vgpr_32, 0, %59:vgpr_32, 0, %60:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %91:vgpr_32 = V_MAX3_F32_e64 0, %90:vgpr_32, 0, %61:vgpr_32, 0, %62:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %92:vgpr_32 = V_MAX3_F32_e64 0, %91:vgpr_32, 0, %63:vgpr_32, 0, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %93:vgpr_32 = V_MAX3_F32_e64 0, %92:vgpr_32, 0, %65:vgpr_32, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %94:vgpr_32 = V_MAX3_F32_e64 0, %93:vgpr_32, 0, %67:vgpr_32, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %95:vgpr_32 = V_MAX3_F32_e64 0, %94:vgpr_32, 0, %69:vgpr_32, 0, %70:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %96:vgpr_32 = V_MAX3_F32_e64 0, %95:vgpr_32, 0, %71:vgpr_32, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %97:vgpr_32 = V_MAX3_F32_e64 0, %96:vgpr_32, 0, %73:vgpr_32, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %98:vgpr_32 = V_MAX3_F32_e64 0, %97:vgpr_32, 0, %75:vgpr_32, 0, %76:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %99:vgpr_32 = V_MAX3_F32_e64 0, %98:vgpr_32, 0, %77:vgpr_32, 0, %78:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %100:vgpr_32 = V_MAX3_F32_e64 0, %99:vgpr_32, 0, %79:vgpr_32, 0, %80:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %101:vgpr_32 = V_MAX3_F32_e64 0, %100:vgpr_32, 0, %81:vgpr_32, 0, %82:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %102:vgpr_32 = V_MAX3_F32_e64 0, %101:vgpr_32, 0, %83:vgpr_32, 0, %84:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %103:vgpr_32 = V_MAX3_F32_e64 0, %102:vgpr_32, 0, %85:vgpr_32, 0, %86:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %104:vgpr_32 = V_MAX3_F32_e64 0, %103:vgpr_32, 0, %87:vgpr_32, 0, %88:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %105:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %104:vgpr_32, 0, implicit $exec
+  %107:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %105:vgpr_32, %105:vgpr_32, implicit $mode, implicit $exec
+  %108:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %104:vgpr_32, %107:vgpr_32, implicit $mode, implicit $exec
+  %109:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %108:vgpr_32, 0, implicit $exec
+  %110:vgpr_32 = V_CNDMASK_B32_e64 0, %109:vgpr_32, 0, %108:vgpr_32, %12:sreg_64_xexec, implicit $exec
+  %111:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %110:vgpr_32, %110:vgpr_32, implicit $mode, implicit $exec
+  %112:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %13:vgpr_32, %13:vgpr_32, implicit $mode, implicit $exec
+  %113:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %112:vgpr_32, %111:vgpr_32, implicit $mode, implicit $exec
+  %114:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %115:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %114:vgpr_32, implicit $mode, implicit $exec
+  %116:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %115:vgpr_32, implicit $mode, implicit $exec
+  %117:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %118:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %117:vgpr_32, implicit $mode, implicit $exec
+  %119:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %118:vgpr_32, implicit $mode, implicit $exec
+  %120:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %121:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %120:vgpr_32, implicit $mode, implicit $exec
+  %122:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %121:vgpr_32, implicit $mode, implicit $exec
+  %123:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %124:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %123:vgpr_32, implicit $mode, implicit $exec
+  %125:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %124:vgpr_32, implicit $mode, implicit $exec
+  %126:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %127:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %126:vgpr_32, implicit $mode, implicit $exec
+  %128:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %127:vgpr_32, implicit $mode, implicit $exec
+  %129:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %130:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %129:vgpr_32, implicit $mode, implicit $exec
+  %131:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %130:vgpr_32, implicit $mode, implicit $exec
+  %132:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %133:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %132:vgpr_32, implicit $mode, implicit $exec
+  %134:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %133:vgpr_32, implicit $mode, implicit $exec
+  %135:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %136:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %135:vgpr_32, implicit $mode, implicit $exec
+  %137:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %136:vgpr_32, implicit $mode, implicit $exec
+  %138:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %139:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %138:vgpr_32, implicit $mode, implicit $exec
+  %140:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %139:vgpr_32, implicit $mode, implicit $exec
+  %141:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %142:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %141:vgpr_32, implicit $mode, implicit $exec
+  %143:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %142:vgpr_32, implicit $mode, implicit $exec
+  %144:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %145:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %144:vgpr_32, implicit $mode, implicit $exec
+  %146:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %145:vgpr_32, implicit $mode, implicit $exec
+  %147:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %148:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %147:vgpr_32, implicit $mode, implicit $exec
+  %149:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %148:vgpr_32, implicit $mode, implicit $exec
+  %150:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %151:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %150:vgpr_32, implicit $mode, implicit $exec
+  %152:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %151:vgpr_32, implicit $mode, implicit $exec
+  %153:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %154:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %153:vgpr_32, implicit $mode, implicit $exec
+  %155:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %154:vgpr_32, implicit $mode, implicit $exec
+  %156:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %156:vgpr_32, implicit $mode, implicit $exec
+  %158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %157:vgpr_32, implicit $mode, implicit $exec
+  %159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %159:vgpr_32, implicit $mode, implicit $exec
+  %161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %160:vgpr_32, implicit $mode, implicit $exec
+  %162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %162:vgpr_32, implicit $mode, implicit $exec
+  %164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %163:vgpr_32, implicit $mode, implicit $exec
+  %165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %165:vgpr_32, implicit $mode, implicit $exec
+  %167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %166:vgpr_32, implicit $mode, implicit $exec
+  %168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %168:vgpr_32, implicit $mode, implicit $exec
+  %170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %169:vgpr_32, implicit $mode, implicit $exec
+  %171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %171:vgpr_32, implicit $mode, implicit $exec
+  %173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %172:vgpr_32, implicit $mode, implicit $exec
+  %174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %174:vgpr_32, implicit $mode, implicit $exec
+  %176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %175:vgpr_32, implicit $mode, implicit $exec
+  %177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %177:vgpr_32, implicit $mode, implicit $exec
+  %179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %178:vgpr_32, implicit $mode, implicit $exec
+  %180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %180:vgpr_32, implicit $mode, implicit $exec
+  %182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %181:vgpr_32, implicit $mode, implicit $exec
+  %183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %183:vgpr_32, implicit $mode, implicit $exec
+  %185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %184:vgpr_32, implicit $mode, implicit $exec
+  %186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %186:vgpr_32, implicit $mode, implicit $exec
+  %188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %187:vgpr_32, implicit $mode, implicit $exec
+  %189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %189:vgpr_32, implicit $mode, implicit $exec
+  %191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %190:vgpr_32, implicit $mode, implicit $exec
+  %192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %192:vgpr_32, implicit $mode, implicit $exec
+  %194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %193:vgpr_32, implicit $mode, implicit $exec
+  %195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %195:vgpr_32, implicit $mode, implicit $exec
+  %197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %196:vgpr_32, implicit $mode, implicit $exec
+  %198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %198:vgpr_32, implicit $mode, implicit $exec
+  %200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %199:vgpr_32, implicit $mode, implicit $exec
+  %201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %201:vgpr_32, implicit $mode, implicit $exec
+  %203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %202:vgpr_32, implicit $mode, implicit $exec
+  %204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %204:vgpr_32, implicit $mode, implicit $exec
+  %206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %205:vgpr_32, implicit $mode, implicit $exec
+  %207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %207:vgpr_32, implicit $mode, implicit $exec
+  %209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %208:vgpr_32, implicit $mode, implicit $exec
+  %210:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %116:vgpr_32, implicit $mode, implicit $exec
+  %211:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %119:vgpr_32, %210:vgpr_32, implicit $mode, implicit $exec
+  %212:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %122:vgpr_32, %211:vgpr_32, implicit $mode, implicit $exec
+  %213:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %125:vgpr_32, %212:vgpr_32, implicit $mode, implicit $exec
+  %214:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %128:vgpr_32, %213:vgpr_32, implicit $mode, implicit $exec
+  %215:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %131:vgpr_32, %214:vgpr_32, implicit $mode, implicit $exec
+  %216:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %134:vgpr_32, %215:vgpr_32, implicit $mode, implicit $exec
+  %217:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %137:vgpr_32, %216:vgpr_32, implicit $mode, implicit $exec
+  %218:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %140:vgpr_32, %217:vgpr_32, implicit $mode, implicit $exec
+  %219:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %143:vgpr_32, %218:vgpr_32, implicit $mode, implicit $exec
+  %220:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %146:vgpr_32, %219:vgpr_32, implicit $mode, implicit $exec
+  %221:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %149:vgpr_32, %220:vgpr_32, implicit $mode, implicit $exec
+  %222:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %152:vgpr_32, %221:vgpr_32, implicit $mode, implicit $exec
+  %223:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %155:vgpr_32, %222:vgpr_32, implicit $mode, implicit $exec
+  %224:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %158:vgpr_32, %223:vgpr_32, implicit $mode, implicit $exec
+  %225:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %161:vgpr_32, %224:vgpr_32, implicit $mode, implicit $exec
+  %226:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %164:vgpr_32, %225:vgpr_32, implicit $mode, implicit $exec
+  %227:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %167:vgpr_32, %226:vgpr_32, implicit $mode, implicit $exec
+  %228:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %170:vgpr_32, %227:vgpr_32, implicit $mode, implicit $exec
+  %229:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %173:vgpr_32, %228:vgpr_32, implicit $mode, implicit $exec
+  %230:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %176:vgpr_32, %229:vgpr_32, implicit $mode, implicit $exec
+  %231:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %179:vgpr_32, %230:vgpr_32, implicit $mode, implicit $exec
+  %232:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %182:vgpr_32, %231:vgpr_32, implicit $mode, implicit $exec
+  %233:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %185:vgpr_32, %232:vgpr_32, implicit $mode, implicit $exec
+  %234:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %188:vgpr_32, %233:vgpr_32, implicit $mode, implicit $exec
+  %235:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %191:vgpr_32, %234:vgpr_32, implicit $mode, implicit $exec
+  %236:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %194:vgpr_32, %235:vgpr_32, implicit $mode, implicit $exec
+  %237:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %197:vgpr_32, %236:vgpr_32, implicit $mode, implicit $exec
+  %238:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %200:vgpr_32, %237:vgpr_32, implicit $mode, implicit $exec
+  %239:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %203:vgpr_32, %238:vgpr_32, implicit $mode, implicit $exec
+  %240:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %206:vgpr_32, %239:vgpr_32, implicit $mode, implicit $exec
+  %241:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %209:vgpr_32, %240:vgpr_32, implicit $mode, implicit $exec
+  %242:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %241:vgpr_32, 0, implicit $exec
+  %243:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %241:vgpr_32, %242:vgpr_32, implicit $mode, implicit $exec
+  %244:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %243:vgpr_32, 0, implicit $exec
+  %0:vgpr_32 = V_CNDMASK_B32_e64 0, %244:vgpr_32, 0, %243:vgpr_32, %12:sreg_64_xexec, implicit $exec
+  %245:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %13:vgpr_32, %113:vgpr_32, implicit $mode, implicit $exec
+  %246:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %245:vgpr_32, implicit $mode, implicit $exec
+  undef %247.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %246:vgpr_32, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %248:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+  %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+  %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+  %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+  %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, %16:vgpr_32, implicit $exec
+  %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec
+  %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec
+  %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec
+  DS_WRITE_B32_gfx9 %255:vgpr_32, %248:vgpr_32, 0, 0, implicit $exec
+  %256:vgpr_32 = V_LSHL_ADD_U32_e64 %19:vgpr_32, 1, %255:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %256:vgpr_32, %249:vgpr_32, 0, 0, implicit $exec
+  %257:vgpr_32 = V_LSHL_ADD_U32_e64 %20:vgpr_32, 1, %256:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %257:vgpr_32, %250:vgpr_32, 0, 0, implicit $exec
+  %258:vgpr_32 = V_LSHL_ADD_U32_e64 %21:vgpr_32, 1, %257:vgpr_32, implicit $exec
+  DS_WRITE_B32_gfx9 %258:vgpr_32, %251:vgpr_32, 0, 0, implicit $exec
+  %0:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %1:vgpr_32, %247.sub0:vreg_64_align2, %0:vgpr_32, implicit $mode, implicit $exec
+  %259.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %259.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %260.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %261:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %116:vgpr_32, implicit $mode, implicit $exec
+  %262:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %119:vgpr_32, implicit $mode, implicit $exec
+  %263:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %122:vgpr_32, implicit $mode, implicit $exec
+  %264:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %128:vgpr_32, implicit $mode, implicit $exec
+  %265:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %131:vgpr_32, implicit $mode, implicit $exec
+  %266:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %134:vgpr_32, implicit $mode, implicit $exec
+  %267:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %140:vgpr_32, implicit $mode, implicit $exec
+  %268:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %143:vgpr_32, implicit $mode, implicit $exec
+  %269:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %146:vgpr_32, implicit $mode, implicit $exec
+  %270:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %152:vgpr_32, implicit $mode, implicit $exec
+  %271:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %155:vgpr_32, implicit $mode, implicit $exec
+  %272:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %158:vgpr_32, implicit $mode, implicit $exec
+  %273:vgpr_32 = V_ADD_U32_e32 %22:vgpr_32, %37:vgpr_32, implicit $exec
+  %274:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %273:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  %275:vgpr_32 = V_ADD_U32_e32 %23:vgpr_32, %37:vgpr_32, implicit $exec
+  %276:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %275:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %277:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+  %278:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+  %279:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+  %280:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %281:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+  %282:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+  %283:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+  %284:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+  DS_WRITE_B32_gfx9 %255:vgpr_32, %281:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %256:vgpr_32, %282:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %257:vgpr_32, %283:vgpr_32, 0, 0, implicit $exec
+  DS_WRITE_B32_gfx9 %258:vgpr_32, %284:vgpr_32, 0, 0, implicit $exec
+  %285:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %125:vgpr_32, implicit $mode, implicit $exec
+  %286:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %137:vgpr_32, implicit $mode, implicit $exec
+  %287:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %149:vgpr_32, implicit $mode, implicit $exec
+  %288:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %161:vgpr_32, implicit $mode, implicit $exec
+  undef %289.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %263:vgpr_32, 0, %285:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %289.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %261:vgpr_32, 0, %262:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %290.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %266:vgpr_32, 0, %286:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %290.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %264:vgpr_32, 0, %265:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %291.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %269:vgpr_32, 0, %287:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %291.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %267:vgpr_32, 0, %268:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %292.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %272:vgpr_32, 0, %288:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %292.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %270:vgpr_32, 0, %271:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %293:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %209:vgpr_32, implicit $mode, implicit $exec
+  %294:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %203:vgpr_32, implicit $mode, implicit $exec
+  %295:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %197:vgpr_32, implicit $mode, implicit $exec
+  %296:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %191:vgpr_32, implicit $mode, implicit $exec
+  %297:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %185:vgpr_32, implicit $mode, implicit $exec
+  %298:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %179:vgpr_32, implicit $mode, implicit $exec
+  %299:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %173:vgpr_32, implicit $mode, implicit $exec
+  %300:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %167:vgpr_32, implicit $mode, implicit $exec
+  %301:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %206:vgpr_32, implicit $mode, implicit $exec
+  %302:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %200:vgpr_32, implicit $mode, implicit $exec
+  %303:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %194:vgpr_32, implicit $mode, implicit $exec
+  %304:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %188:vgpr_32, implicit $mode, implicit $exec
+  %305:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %182:vgpr_32, implicit $mode, implicit $exec
+  %306:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %176:vgpr_32, implicit $mode, implicit $exec
+  %307:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %170:vgpr_32, implicit $mode, implicit $exec
+  %308:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %164:vgpr_32, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  undef %309.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %307:vgpr_32, 0, %299:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %309.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %308:vgpr_32, 0, %300:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %310.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %305:vgpr_32, 0, %297:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %310.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %306:vgpr_32, 0, %298:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %311.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %303:vgpr_32, 0, %295:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %311.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %304:vgpr_32, 0, %296:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  undef %312.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %301:vgpr_32, 0, %293:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %312.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %302:vgpr_32, 0, %294:vgpr_32, 0, 0, implicit $mode, implicit $exec
+  %313:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %314:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %315:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %316:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+  INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+  %37:vgpr_32 = V_ADD_U32_e32 %26:sreg_32, %37:vgpr_32, implicit $exec
+  %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec
+  S_ENDPGM 0
+...
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir
new file mode 100644
index 00000000000000..009e753a760b19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir
@@ -0,0 +1,643 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @tinyInterleave() #0 { ret void }
+  ; GCN-LABEL: tinyInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_mov_b32_e32 v106, v41
+  ; GCN-NEXT:    v_mov_b32_e32 v102, v45
+  ; GCN-NEXT:    v_max_f32_e32 v41, v45, v45
+  ; GCN-NEXT:    v_max_f32_e32 v45, v61, v61
+  ; GCN-NEXT:    ; implicit-def: $vgpr100
+  ; GCN-NEXT:    ; implicit-def: $vgpr101
+  ; GCN-NEXT:    v_max_f32_e32 v41, v100, v41
+  ; GCN-NEXT:    v_max_f32_e32 v45, v101, v45
+  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    v_max3_f32 v45, v45, v62, v63
+  ; GCN-NEXT:    v_max3_f32 v41, v41, v46, v47
+  ; GCN-NEXT:    ; implicit-def: $vgpr120
+  ; GCN-NEXT:    v_mov_b32_e32 v113, v67
+  ; GCN-NEXT:    v_mov_b32_e32 v107, v57
+  ; GCN-NEXT:    ds_bpermute_b32 v57, v120, v45
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v120, v41
+  ; GCN-NEXT:    ; implicit-def: $vgpr110_vgpr111
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GCN-NEXT:    v_mov_b32_e32 v112, v83
+  ; GCN-NEXT:    v_mov_b32_e32 v103, v61
+  ; GCN-NEXT:    v_mov_b32_e32 v104, v46
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(1)
+  ; GCN-NEXT:    v_max3_f32 v101, v111, v45, v57
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_max3_f32 v100, v110, v41, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], s[4:5], v[100:101]
+  ; GCN-NEXT:    v_mov_b32_e32 v105, v62
+  ; GCN-NEXT:    v_fma_f32 v41, s5, v80, -v100
+  ; GCN-NEXT:    v_fma_f32 v45, s5, v81, -v100
+  ; GCN-NEXT:    v_fma_f32 v46, s5, v82, -v100
+  ; GCN-NEXT:    v_pk_fma_f32 v[80:81], s[4:5], v[112:113], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_fma_f32 v57, s5, v64, -v101
+  ; GCN-NEXT:    v_fma_f32 v61, s5, v65, -v101
+  ; GCN-NEXT:    v_fma_f32 v62, s5, v66, -v101
+  ; GCN-NEXT:    v_pk_fma_f32 v[82:83], s[4:5], v[110:111], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v110, v41
+  ; GCN-NEXT:    v_exp_f32_e32 v112, v45
+  ; GCN-NEXT:    v_mov_b32_e32 v64, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v46
+  ; GCN-NEXT:    v_mov_b32_e32 v65, v68
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v110
+  ; GCN-NEXT:    v_pk_fma_f32 v[114:115], s[4:5], v[64:65], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_mov_b32_e32 v68, v85
+  ; GCN-NEXT:    v_mov_b32_e32 v64, v86
+  ; GCN-NEXT:    v_mov_b32_e32 v65, v70
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr118
+  ; GCN-NEXT:    v_exp_f32_e32 v111, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v112
+  ; GCN-NEXT:    v_pk_fma_f32 v[68:69], s[4:5], v[68:69], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v113, v61
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v84
+  ; GCN-NEXT:    v_pk_fma_f32 v[116:117], s[4:5], v[64:65], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v118 offset:4352
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mov_b32_e32 v70, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v62
+  ; GCN-NEXT:    v_exp_f32_e32 v46, v82
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v80
+  ; GCN-NEXT:    v_pk_fma_f32 v[70:71], s[4:5], v[70:71], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    v_pack_b32_f16 v118, v41, v45
+  ; GCN-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v119, v57, v61
+  ; GCN-NEXT:    v_exp_f32_e32 v46, v83
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  ; GCN-NEXT:    v_mov_b32_e32 v86, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[118:119], v[16:31]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v111
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v114
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v113
+  ; GCN-NEXT:    v_exp_f32_e32 v68, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v85
+  ; GCN-NEXT:    v_exp_f32_e32 v114, v116
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v81
+  ; GCN-NEXT:    v_exp_f32_e32 v70, v70
+  ; GCN-NEXT:    v_mov_b32_e32 v87, v72
+  ; GCN-NEXT:    v_mov_b32_e32 v72, v89
+  ; GCN-NEXT:    v_mov_b32_e32 v88, v90
+  ; GCN-NEXT:    v_mov_b32_e32 v89, v74
+  ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[46:47] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v119, v57, v61
+  ; GCN-NEXT:    v_pack_b32_f16 v118, v41, v45
+  ; GCN-NEXT:    v_pk_fma_f32 v[72:73], s[4:5], v[72:73], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_mov_b32_e32 v74, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[118:119], v[0:15]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v82
+  ; GCN-NEXT:    v_pk_fma_f32 v[64:65], s[4:5], v[86:87], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v115
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v68
+  ; GCN-NEXT:    v_pk_fma_f32 v[86:87], s[4:5], v[88:89], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v69, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v114
+  ; GCN-NEXT:    v_exp_f32_e32 v115, v117
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v70
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v71
+  ; GCN-NEXT:    v_pack_b32_f16 v88, v41, v45
+  ; GCN-NEXT:    v_pk_fma_f32 v[74:75], s[4:5], v[74:75], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pack_b32_f16 v89, v46, v57
+  ; GCN-NEXT:    ; implicit-def: $vgpr121
+  ; GCN-NEXT:    v_mov_b32_e32 v108, v94
+  ; GCN-NEXT:    v_mov_b32_e32 v109, v78
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[88:89], v[16:31]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v83
+  ; GCN-NEXT:    v_exp_f32_e32 v46, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v69
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v115
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v71
+  ; GCN-NEXT:    v_mov_b32_e32 v89, v63
+  ; GCN-NEXT:    v_pack_b32_f16 v62, v41, v45
+  ; GCN-NEXT:    v_mov_b32_e32 v88, v47
+  ; GCN-NEXT:    v_pack_b32_f16 v63, v57, v61
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v74
+  ; GCN-NEXT:    v_fma_f32 v45, s5, v92, -v100
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v46
+  ; GCN-NEXT:    v_exp_f32_e32 v47, v65
+  ; GCN-NEXT:    ds_read_b128 v[62:65], v121 offset:4352
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v72
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v73
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v86
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v87
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v91, v74
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_fma_f32 v61, s5, v93, -v100
+  ; GCN-NEXT:    v_pk_fma_f32 v[66:67], s[4:5], v[108:109], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pack_b32_f16 v91, v90, v91
+  ; GCN-NEXT:    v_pack_b32_f16 v90, v41, v57
+  ; GCN-NEXT:    v_fma_f32 v57, s5, v76, -v101
+  ; GCN-NEXT:    v_mov_b32_e32 v78, v95
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[62:63], v[90:91], v[16:31]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v41, v47
+  ; GCN-NEXT:    v_exp_f32_e32 v76, v45
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v73
+  ; GCN-NEXT:    v_fma_f32 v91, s5, v77, -v101
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v61
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v66, v66
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_fma_f32 v39, s5, v39, -v100
+  ; GCN-NEXT:    v_pack_b32_f16 v94, v41, v45
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v39
+  ; GCN-NEXT:    v_pack_b32_f16 v95, v61, v77
+  ; GCN-NEXT:    v_pk_fma_f32 v[78:79], s[4:5], v[78:79], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_fma_f32 v39, s5, v40, -v100
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[62:63], v[94:95], v[0:15]
+  ; GCN-NEXT:    v_mov_b32_e32 v95, v58
+  ; GCN-NEXT:    v_mov_b32_e32 v58, v43
+  ; GCN-NEXT:    v_mov_b32_e32 v94, v42
+  ; GCN-NEXT:    v_pk_fma_f32 v[42:43], s[4:5], v[58:59], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[58:59], s[4:5], v[102:103], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[102:103], v[110:111], v[112:113]
+  ; GCN-NEXT:    v_fma_f32 v41, s5, v55, -v101
+  ; GCN-NEXT:    v_pk_add_f32 v[84:85], v[84:85], v[102:103]
+  ; GCN-NEXT:    v_exp_f32_e32 v40, v78
+  ; GCN-NEXT:    v_pk_add_f32 v[80:81], v[80:81], v[84:85]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v76
+  ; GCN-NEXT:    v_pk_add_f32 v[80:81], v[82:83], v[80:81]
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v41
+  ; GCN-NEXT:    v_pk_add_f32 v[68:69], v[68:69], v[80:81]
+  ; GCN-NEXT:    v_fma_f32 v41, s5, v56, -v101
+  ; GCN-NEXT:    v_pk_add_f32 v[68:69], v[114:115], v[68:69]
+  ; GCN-NEXT:    v_exp_f32_e32 v56, v39
+  ; GCN-NEXT:    v_pk_add_f32 v[68:69], v[70:71], v[68:69]
+  ; GCN-NEXT:    v_mov_b32_e32 v39, v54
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[46:47], v[68:69]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v90
+  ; GCN-NEXT:    v_exp_f32_e32 v77, v57
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[72:73], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v91
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[86:87], v[46:47]
+  ; GCN-NEXT:    v_mov_b32_e32 v68, v32
+  ; GCN-NEXT:    v_mov_b32_e32 v69, v48
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[74:75], v[46:47]
+  ; GCN-NEXT:    v_mov_b32_e32 v48, v33
+  ; GCN-NEXT:    v_mov_b32_e32 v32, v34
+  ; GCN-NEXT:    v_mov_b32_e32 v33, v50
+  ; GCN-NEXT:    v_mov_b32_e32 v50, v35
+  ; GCN-NEXT:    v_mov_b32_e32 v34, v36
+  ; GCN-NEXT:    v_mov_b32_e32 v35, v52
+  ; GCN-NEXT:    v_mov_b32_e32 v52, v37
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v55, v66
+  ; GCN-NEXT:    v_exp_f32_e32 v57, v41
+  ; GCN-NEXT:    v_exp_f32_e32 v41, v79
+  ; GCN-NEXT:    v_pk_fma_f32 v[36:37], s[4:5], v[68:69], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_fma_f32 v[48:49], s[4:5], v[48:49], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_exp_f32_e32 v37, v37
+  ; GCN-NEXT:    v_exp_f32_e32 v36, v36
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[76:77], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v49, v49
+  ; GCN-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN-NEXT:    v_pk_fma_f32 v[32:33], s[4:5], v[32:33], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[90:91], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v33, v33
+  ; GCN-NEXT:    v_exp_f32_e32 v32, v32
+  ; GCN-NEXT:    v_pk_fma_f32 v[50:51], s[4:5], v[50:51], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[66:67], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v51, v51
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v50
+  ; GCN-NEXT:    v_pk_fma_f32 v[34:35], s[4:5], v[34:35], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[46:47], v[40:41], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v35, v35
+  ; GCN-NEXT:    v_exp_f32_e32 v34, v34
+  ; GCN-NEXT:    v_pk_fma_f32 v[52:53], s[4:5], v[52:53], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[36:37], v[36:37], v[46:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v53, v53
+  ; GCN-NEXT:    v_exp_f32_e32 v52, v52
+  ; GCN-NEXT:    v_pk_fma_f32 v[38:39], s[4:5], v[38:39], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[36:37], v[48:49], v[36:37]
+  ; GCN-NEXT:    v_exp_f32_e32 v39, v39
+  ; GCN-NEXT:    v_exp_f32_e32 v38, v38
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[32:33], v[36:37]
+  ; GCN-NEXT:    v_pk_fma_f32 v[62:63], s[4:5], v[106:107], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[50:51], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v63
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[34:35], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v62, v62
+  ; GCN-NEXT:    v_pk_fma_f32 v[94:95], s[4:5], v[94:95], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[52:53], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v95, v95
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v94
+  ; GCN-NEXT:    v_mov_b32_e32 v45, v60
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[38:39], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v43, v43
+  ; GCN-NEXT:    v_exp_f32_e32 v42, v42
+  ; GCN-NEXT:    v_pk_fma_f32 v[44:45], s[4:5], v[44:45], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v40
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[92:93], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v45, v45
+  ; GCN-NEXT:    v_exp_f32_e32 v44, v44
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[56:57], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v59, v59
+  ; GCN-NEXT:    v_exp_f32_e32 v58, v58
+  ; GCN-NEXT:    v_pk_fma_f32 v[60:61], s[4:5], v[104:105], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[62:63], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v61
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v60
+  ; GCN-NEXT:    v_pk_fma_f32 v[88:89], s[4:5], v[88:89], v[100:101] neg_lo:[0,0,1] neg_hi:[0,0,1]
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[94:95], v[32:33]
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[42:43], v[32:33]
+  ; GCN-NEXT:    v_pack_b32_f16 v35, v55, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v34, v78, v54
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[44:45], v[32:33]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v36, v77
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[34:35], v[16:31]
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[58:59], v[32:33]
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr98_vgpr99
+  ; GCN-NEXT:    ; implicit-def: $sgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr122
+  ; GCN-NEXT:    ; implicit-def: $vgpr124
+  ; GCN-NEXT:    ; implicit-def: $vgpr123
+  ; GCN-NEXT:    ; implicit-def: $vgpr125
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[60:61], v[32:33]
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_pk_add_f32 v[32:33], v[88:89], v[32:33]
+  ; GCN-NEXT:    s_nop 5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v41
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v91
+  ; GCN-NEXT:    ds_bpermute_b32 v16, v120, v32
+  ; GCN-NEXT:    ds_bpermute_b32 v17, v120, v33
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_pack_b32_f16 v17, v18, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v16, v36, v20
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[16:17], v[0:15]
+  ; GCN-NEXT:    s_nop 7
+  ; GCN-NEXT:    s_nop 2
+  ; GCN-NEXT:    v_perm_b32 v0, v98, v96, s0
+  ; GCN-NEXT:    v_perm_b32 v1, v98, v96, s1
+  ; GCN-NEXT:    v_perm_b32 v2, v99, v97, s0
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    ds_write_b32 v122, v0
+  ; GCN-NEXT:    v_lshl_add_u32 v0, v124, 1, v122
+  ; GCN-NEXT:    v_perm_b32 v3, v99, v97, s1
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v123, v1
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v0, v2
+  ; GCN-NEXT:    v_lshl_add_u32 v0, v125, 1, v122
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b32 v0, v3
+  ; GCN-NEXT:    s_endpgm
+  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+
+  !0 = !{i64 2862105}
+
+...
+
+
+---
+name:            tinyInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       3
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+    %712:vgpr_32 = IMPLICIT_DEF
+    %728:vgpr_32 = IMPLICIT_DEF
+    %35:vgpr_32 = IMPLICIT_DEF
+    %1979:vreg_64_align2 = IMPLICIT_DEF
+    %1:sgpr_512 = IMPLICIT_DEF
+    %608:vreg_512_align2 = IMPLICIT_DEF
+    %621:vreg_512_align2 = IMPLICIT_DEF
+    %639:vreg_512_align2 = IMPLICIT_DEF
+    %654:vreg_512_align2 = IMPLICIT_DEF
+    %38:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = IMPLICIT_DEF
+    %753:vreg_64_align2 = IMPLICIT_DEF
+    %751:vreg_64_align2 = IMPLICIT_DEF
+    %731:sreg_32 = IMPLICIT_DEF
+    %733:sreg_32 = IMPLICIT_DEF
+    %753:vreg_64_align2 = IMPLICIT_DEF
+    %746:vgpr_32 = IMPLICIT_DEF
+    %747:vgpr_32 = IMPLICIT_DEF
+    %41:vgpr_32 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %1864:vreg_512_align2 = IMPLICIT_DEF
+    %1861:vreg_512_align2 = IMPLICIT_DEF
+    IGLP_OPT 2
+    undef %1942.sub0:vreg_64_align2 = COPY %608.sub3:vreg_512_align2
+    %1942.sub1:vreg_64_align2 = COPY %621.sub3:vreg_512_align2
+    undef %1923.sub0:vreg_64_align2 = COPY %608.sub14:vreg_512_align2
+    %1923.sub1:vreg_64_align2 = COPY %621.sub14:vreg_512_align2
+    undef %1904.sub0:vreg_64_align2 = COPY %639.sub9:vreg_512_align2
+    %1904.sub1:vreg_64_align2 = COPY %654.sub9:vreg_512_align2
+    undef %1885.sub0:vreg_64_align2 = COPY %639.sub13:vreg_512_align2
+    %1885.sub1:vreg_64_align2 = COPY %654.sub13:vreg_512_align2
+    %758:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %639.sub13:vreg_512_align2, %639.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %759:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %712:vgpr_32, %758:vgpr_32, implicit $mode, implicit $exec
+    %760:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %654.sub13:vreg_512_align2, %654.sub13:vreg_512_align2, implicit $mode, implicit $exec
+    %761:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %728:vgpr_32, %760:vgpr_32, implicit $mode, implicit $exec
+    undef %1856.sub0:vreg_64_align2 = COPY %639.sub14:vreg_512_align2
+    %1856.sub1:vreg_64_align2 = COPY %654.sub14:vreg_512_align2
+    undef %1977.sub0:vreg_64_align2 = COPY %639.sub15:vreg_512_align2
+    %1977.sub1:vreg_64_align2 = COPY %654.sub15:vreg_512_align2
+    %764:vgpr_32 = V_MAX3_F32_e64 0, %761:vgpr_32, 0, %654.sub14:vreg_512_align2, 0, %654.sub15:vreg_512_align2, 0, 0, implicit $mode, implicit $exec
+    %765:vgpr_32 = V_MAX3_F32_e64 0, %759:vgpr_32, 0, %639.sub14:vreg_512_align2, 0, %639.sub15:vreg_512_align2, 0, 0, implicit $mode, implicit $exec
+    %766:vgpr_32 = DS_BPERMUTE_B32 %35:vgpr_32, %765:vgpr_32, 0, implicit $exec
+    %767:vgpr_32 = DS_BPERMUTE_B32 %35:vgpr_32, %764:vgpr_32, 0, implicit $exec
+    undef %1959.sub1:vreg_64_align2 = V_MAX3_F32_e64 0, %1979.sub1:vreg_64_align2, 0, %764:vgpr_32, 0, %767:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1959.sub0:vreg_64_align2 = V_MAX3_F32_e64 0, %1979.sub0:vreg_64_align2, 0, %765:vgpr_32, 0, %766:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %771:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %1.sub4_sub5:sgpr_512, 8, %1959:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %774:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub0:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1918.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %774:vgpr_32, implicit $mode, implicit $exec
+    %776:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub1:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1899.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %776:vgpr_32, implicit $mode, implicit $exec
+    %778:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub2:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %779:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub12:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1930.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %779:vgpr_32, implicit $mode, implicit $exec
+    %781:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %608.sub13:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %782:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %639.sub7:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1912.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %782:vgpr_32, implicit $mode, implicit $exec
+    %783:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %639.sub8:vreg_512_align2, 1, %771.sub0:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %785:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub0:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1918.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %785:vgpr_32, implicit $mode, implicit $exec
+    %787:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub1:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1899.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %787:vgpr_32, implicit $mode, implicit $exec
+    %789:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub2:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %790:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub12:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1930.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %790:vgpr_32, implicit $mode, implicit $exec
+    %792:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %621.sub13:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %793:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %654.sub7:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    %1912.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %793:vgpr_32, implicit $mode, implicit $exec
+    %794:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub5:sgpr_512, 0, %654.sub8:vreg_512_align2, 1, %771.sub1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+    undef %1941.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %794:vgpr_32, implicit $mode, implicit $exec
+    %1941.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %783:vgpr_32, implicit $mode, implicit $exec
+    %798:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1904:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1922.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %798.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1922.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %798.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1903.sub0:vreg_64_align2 = COPY %639.sub10:vreg_512_align2
+    %1903.sub1:vreg_64_align2 = COPY %654.sub10:vreg_512_align2
+    %806:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1903:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1884.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %806.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1884.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %806.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %654.sub10:vreg_512_align2 = COPY %639.sub11:vreg_512_align2
+    %814:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub10_sub11:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1976.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %814.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1976.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %814.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %639.sub13:vreg_512_align2 = COPY %654.sub12:vreg_512_align2
+    %822:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %639.sub12_sub13:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1940.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %822.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1940.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %822.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %829:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1885:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1921.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %829.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1921.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %829.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %836:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1856:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1902.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %836.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1902.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %836.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %843:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1977:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1883.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %843.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1883.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %843.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1870.sub0:vreg_64_align2 = COPY %608.sub4:vreg_512_align2
+    %1870.sub1:vreg_64_align2 = COPY %621.sub4:vreg_512_align2
+    %621.sub4:vreg_512_align2 = COPY %608.sub5:vreg_512_align2
+    undef %1957.sub0:vreg_64_align2 = COPY %608.sub6:vreg_512_align2
+    %1957.sub1:vreg_64_align2 = COPY %621.sub6:vreg_512_align2
+    %621.sub6:vreg_512_align2 = COPY %608.sub7:vreg_512_align2
+    undef %1920.sub0:vreg_64_align2 = COPY %608.sub8:vreg_512_align2
+    %1920.sub1:vreg_64_align2 = COPY %621.sub8:vreg_512_align2
+    %621.sub8:vreg_512_align2 = COPY %608.sub9:vreg_512_align2
+    undef %1882.sub0:vreg_64_align2 = COPY %608.sub10:vreg_512_align2
+    %1882.sub1:vreg_64_align2 = COPY %621.sub10:vreg_512_align2
+    %621.sub10:vreg_512_align2 = COPY %608.sub11:vreg_512_align2
+    undef %1974.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %789:vgpr_32, implicit $mode, implicit $exec
+    %1974.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %778:vgpr_32, implicit $mode, implicit $exec
+    %861:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1942:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1956.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %861.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1956.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %861.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %868:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1870:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1938.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %868.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1938.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %868.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %875:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub4_sub5:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1919.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %875.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1919.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %875.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %882:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1957:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1900.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %882.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1900.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %882.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %889:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub6_sub7:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1881.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %889.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1881.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %889.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %896:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1920:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1863.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %896.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1863.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %896.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %903:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub8_sub9:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1973.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %903.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1973.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %903.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %910:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1882:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1955.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %910.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1955.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %910.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %917:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub10_sub11:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1937.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %917.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1937.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %917.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %926:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1918:vreg_64_align2, 8, %1899:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %928:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1974:vreg_64_align2, 8, %926:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %929:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1956:vreg_64_align2, 8, %928:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %930:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1938:vreg_64_align2, 8, %929:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1919:vreg_64_align2, 8, %930:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %932:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1900:vreg_64_align2, 8, %931:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %933:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1881:vreg_64_align2, 8, %932:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %934:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1863:vreg_64_align2, 8, %933:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %935:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1973:vreg_64_align2, 8, %934:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %936:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1955:vreg_64_align2, 8, %935:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %937:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1937:vreg_64_align2, 8, %936:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %621.sub14:vreg_512_align2 = COPY %608.sub15:vreg_512_align2
+    undef %1860.sub0:vreg_64_align2 = COPY %639.sub0:vreg_512_align2
+    %1860.sub1:vreg_64_align2 = COPY %654.sub0:vreg_512_align2
+    %654.sub0:vreg_512_align2 = COPY %639.sub1:vreg_512_align2
+    undef %1954.sub0:vreg_64_align2 = COPY %639.sub2:vreg_512_align2
+    %1954.sub1:vreg_64_align2 = COPY %654.sub2:vreg_512_align2
+    %654.sub2:vreg_512_align2 = COPY %639.sub3:vreg_512_align2
+    undef %1917.sub0:vreg_64_align2 = COPY %639.sub4:vreg_512_align2
+    %1917.sub1:vreg_64_align2 = COPY %654.sub4:vreg_512_align2
+    %654.sub4:vreg_512_align2 = COPY %639.sub5:vreg_512_align2
+    %639.sub7:vreg_512_align2 = COPY %654.sub6:vreg_512_align2
+    undef %1857.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %792:vgpr_32, implicit $mode, implicit $exec
+    %1857.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %781:vgpr_32, implicit $mode, implicit $exec
+    %949:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1923:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1970.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %949.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1970.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %949.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %956:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %621.sub14_sub15:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1952.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %956.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1952.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %956.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %963:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1860:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1934.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %963.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1934.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %963.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %970:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub0_sub1:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1915.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %970.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1915.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %970.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %977:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1954:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1896.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %977.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1896.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %977.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %984:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub2_sub3:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1875.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %984.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1875.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %984.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %991:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1917:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1986.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %991.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1986.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %991.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %998:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %654.sub4_sub5:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1968.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %998.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1968.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %998.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1005:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %639.sub6_sub7:vreg_512_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1950.sub1:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1005.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1950.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1005.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1013:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1930:vreg_64_align2, 8, %937:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1014:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1857:vreg_64_align2, 8, %1013:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1015:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1970:vreg_64_align2, 8, %1014:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1016:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1952:vreg_64_align2, 8, %1015:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1017:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1934:vreg_64_align2, 8, %1016:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1018:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1915:vreg_64_align2, 8, %1017:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1019:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1896:vreg_64_align2, 8, %1018:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1020:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1875:vreg_64_align2, 8, %1019:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1021:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1986:vreg_64_align2, 8, %1020:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1022:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1968:vreg_64_align2, 8, %1021:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1023:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1950:vreg_64_align2, 8, %1022:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1025:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1912:vreg_64_align2, 8, %1023:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1026:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1941:vreg_64_align2, 8, %1025:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1027:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1922:vreg_64_align2, 8, %1026:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1028:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1884:vreg_64_align2, 8, %1027:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1029:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1976:vreg_64_align2, 8, %1028:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1030:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1940:vreg_64_align2, 8, %1029:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1031:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1921:vreg_64_align2, 8, %1030:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1032:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1902:vreg_64_align2, 8, %1031:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %114:vreg_64_align2 = contract nofpexcept V_PK_ADD_F32 8, %1883:vreg_64_align2, 8, %1032:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1971.sub0:vreg_64_align2 = DS_BPERMUTE_B32 %35:vgpr_32, %114.sub0:vreg_64_align2, 0, implicit $exec
+    %1971.sub1:vreg_64_align2 = DS_BPERMUTE_B32 %35:vgpr_32, %114.sub1:vreg_64_align2, 0, implicit $exec
+    %1035:vreg_64_align2 = contract nofpexcept V_PK_FMA_F32 8, %1.sub4_sub5:sgpr_512, 8, %1979:vreg_64_align2, 11, %771:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    undef %1069.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1035.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    undef %1893.sub0:vreg_64_align2 = nofpexcept V_EXP_F32_e32 %1035.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1864.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub0_sub1:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub2_sub3:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub4_sub5:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub6_sub7:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub8_sub9:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub10_sub11:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub12_sub13:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1864.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1864.sub14_sub15:vreg_512_align2, 0, %1893:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub0_sub1:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub2_sub3:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub4_sub5:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub6_sub7:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub8_sub9:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub10_sub11:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub12_sub13:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1861.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %1861.sub14_sub15:vreg_512_align2, 0, %1069:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %1096:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1918.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1097:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1899.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1098:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1974.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1099:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1956.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1100:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1938.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1101:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1919.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1102:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1900.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1103:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1881.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1104:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1863.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1105:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1973.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1106:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1955.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1107:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1937.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1108:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1930.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1109:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1857.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1110:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1970.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1111:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1952.sub0:vreg_64_align2, implicit $mode, implicit $exec
+    %1112:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1918.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1113:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1899.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1114:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1974.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1115:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1956.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1116:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1938.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1117:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1919.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1118:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1900.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1119:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1881.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1120:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1863.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1121:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1973.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1122:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1955.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1123:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1937.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1124:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1930.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1125:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1857.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1126:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1970.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    %1127:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1952.sub1:vreg_64_align2, implicit $mode, implicit $exec
+    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+    undef %1871.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1871.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1983.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1983.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1965.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1965.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1947.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1947.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1927.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1114:vgpr_32, 0, %1115:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1927.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1909.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1118:vgpr_32, 0, %1119:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1909.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1116:vgpr_32, 0, %1117:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1890.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1122:vgpr_32, 0, %1123:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1890.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1120:vgpr_32, 0, %1121:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    undef %1867.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1126:vgpr_32, 0, %1127:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1867.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1124:vgpr_32, 0, %1125:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %1152:vreg_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 4352, 0, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub0_sub1:vreg_128_align2, %1871:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub2_sub3:vreg_128_align2, %1983:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub0_sub1:vreg_128_align2, %1927:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1152.sub2_sub3:vreg_128_align2, %1909:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1175:vreg_128_align2 = DS_READ_B128_gfx9 %39:vgpr_32, 4352, 0, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub0_sub1:vreg_128_align2, %1965:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1864:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub2_sub3:vreg_128_align2, %1947:vreg_64_align2, %1864:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub0_sub1:vreg_128_align2, %1890:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1861:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1175.sub2_sub3:vreg_128_align2, %1867:vreg_64_align2, %1861:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %1194:vgpr_32 = V_PERM_B32_e64 %753.sub0:vreg_64_align2, %751.sub0:vreg_64_align2, %731:sreg_32, implicit $exec
+    %1195:vgpr_32 = V_PERM_B32_e64 %753.sub0:vreg_64_align2, %751.sub0:vreg_64_align2, %733:sreg_32, implicit $exec
+    %1198:vgpr_32 = V_PERM_B32_e64 %753.sub1:vreg_64_align2, %751.sub1:vreg_64_align2, %731:sreg_32, implicit $exec
+    %1199:vgpr_32 = V_PERM_B32_e64 %753.sub1:vreg_64_align2, %751.sub1:vreg_64_align2, %733:sreg_32, implicit $exec
+    DS_WRITE_B32_gfx9 %746:vgpr_32, %1194:vgpr_32, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %747:vgpr_32, %1195:vgpr_32, 0, 0, implicit $exec
+    %1200:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %746:vgpr_32, implicit $exec
+    DS_WRITE_B32_gfx9 %1200:vgpr_32, %1198:vgpr_32, 0, 0, implicit $exec
+    %1201:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %746:vgpr_32, implicit $exec
+    DS_WRITE_B32_gfx9 %1201:vgpr_32, %1199:vgpr_32, 0, 0, implicit $exec
+    S_ENDPGM 0
+...



More information about the llvm-commits mailing list