[llvm] [AMDGPU] Serialize disjoint MFMA chains to hide DS_READ latency (PR #170242)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 1 20:54:03 PST 2025


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/170242

>From dc81a331b4bedcf62b92b6f3c49f78ee0edae4ec Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Mon, 17 Nov 2025 21:27:54 -0600
Subject: [PATCH] [AMDGPU] Serialize disjoint MFMA chains to hide DS_READ
 latency

This patch identifies disjoint chains of dependent
MFMA instructions (with length >= 2) and stitches
them together into a single execution sequence by
adding artificial dependencies from the tail of
one chain to the head of the next.

Currently, the scheduler may schedule disjoint
MFMA chains too early or interleave them, which
can expose high latencies from their associated
DS_READ operands. By strictly serializing these
MFMA chains, we force subsequent chains to execute
later. This artificial delay increases the distance
between the DS_READ issuance and the consuming MFMA
instruction, effectively hiding the load latency.
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |   99 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1507 +++++++++--------
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  735 ++++----
 .../misched-ds-mfma-order-false-deps.mir      |  118 ++
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |   83 +-
 5 files changed, 1382 insertions(+), 1160 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 85addb13aef8d..0252c4c4b08e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -21,6 +21,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 
@@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur(
              "Experimentally, results are mixed, so this should be set on a "
              "case-by-case basis."));
 
+static cl::opt<bool> DisableMfmaChainOrderingDeps(
+    "amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden,
+    cl::desc("Disable artificial false dependencies between MFMA chains"));
+
 // Components of the mask that determines which instruction types may be may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // Add DAG edges that enforce SCHED_BARRIER ordering.
   void addSchedBarrierEdges(SUnit &SU);
 
+  // Add artificial edges from the tail of each MFMA/WMMA data-dependency chain
+  // to the leader of the next one when both chains have at least two MFMAs.
+  void addMfmaFalseDeps();
+
   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
   // not be reordered accross the SCHED_BARRIER. This is used for the base
   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
@@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     }
   }
 
+  if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts())
+    addMfmaFalseDeps();
+
   if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
     // PipelineSolver performs the mutation by adding the edges it
@@ -2681,12 +2693,97 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 
 } // namespace
 
+// Finds chains of MFMA/WMMA SUnits linked by data dependencies and adds an
+// artificial edge from the tail of each chain to the leader of the next one
+// (in discovery order) when both chains contain at least two SUnits.
+void IGroupLPDAGMutation::addMfmaFalseDeps() {
+  DenseMap<SUnit *, SUnit *> MFMAAncestor;
+  SmallVector<SUnit *, 10> MFMAChainLeaders;
+  DenseMap<SUnit *, SUnit *> MFMAChainNext;
+  for (auto &SU : DAG->SUnits) {
+    if (!TII->isMFMAorWMMA(*SU.getInstr()))
+      continue;
+
+    if (MFMAAncestor.contains(&SU))
+      continue;
+
+    SUnit *CurrMFMA = &SU;
+    MFMAAncestor[CurrMFMA] = CurrMFMA;
+    MFMAChainLeaders.push_back(&SU);
+    while (!CurrMFMA->Succs.empty()) {
+      // Count data-dependent MFMA/WMMA successors; remember the last one seen.
+      SUnit *NextMFMA = nullptr;
+      unsigned MFMADataDepSuccCount = 0;
+      for (const auto &Succ : CurrMFMA->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr()))
+          continue;
+
+        // Only data-dependency edges extend a chain.
+        if (Succ.getKind() == SDep::Data) {
+          NextMFMA = SuccSU;
+          MFMADataDepSuccCount++;
+        }
+      }
+
+      // The chain stops here unless there is exactly one data-dependent
+      // MFMA/WMMA successor. NOTE(review): unless CurrMFMA is the leader, it
+      // has not been added to MFMAAncestor, so the outer loop can pick it up
+      // again as the leader of a separate chain -- confirm this is intended.
+      if (MFMADataDepSuccCount != 1) {
+        MFMAChainNext[CurrMFMA] = nullptr;
+        break;
+      }
+
+      // Record SU as the chain leader of CurrMFMA and link it to NextMFMA.
+      MFMAAncestor[CurrMFMA] = &SU;
+      MFMAChainNext[CurrMFMA] = NextMFMA;
+      CurrMFMA = NextMFMA;
+    }
+  }
+
+  // Follows MFMAChainNext from Leader; returns {last SUnit, SUnits visited}.
+  auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> {
+    unsigned Length = 1;
+    SUnit *Curr = Leader;
+    while (MFMAChainNext.count(Curr)) {
+      if (!MFMAChainNext[Curr])
+        break;
+      Curr = MFMAChainNext[Curr];
+      ++Length;
+    }
+    return {Curr, Length};
+  };
+
+  // MFMAChainLeaders holds leaders in the order they appear in DAG->SUnits.
+  // For each pair of consecutive leaders, link the tail of the first chain to
+  // the leader of the second if both chains are at least 2 SUs long.
+  for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) {
+    SUnit *ChainLeaderA = MFMAChainLeaders[I];
+    SUnit *ChainLeaderB = MFMAChainLeaders[I + 1];
+
+    auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA);
+    auto [TailB, LengthB] = GetTailAndLength(ChainLeaderB);
+
+    if (LengthA >= 2 && LengthB >= 2) {
+      // Artificial edge from the tail of chain A to the leader of chain B.
+      LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from "
+                        << TailA->NodeNum << " to " << ChainLeaderB->NodeNum
+                        << "\n");
+      DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial));
+    }
+  }
+}
+
+namespace llvm {
+
 /// \p Phase specifes whether or not this is a reentry into the
 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
 std::unique_ptr<ScheduleDAGMutation>
-llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
   return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
+} // namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 689d1472d6010..a2a00d107a7bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,143 +6,143 @@
   define amdgpu_kernel void @largeInterleave() #0 { ret void }
   ; GCN-LABEL: largeInterleave:
   ; GCN:       ; %bb.0:
-  ; GCN-NEXT:    ; implicit-def: $vgpr16
-  ; GCN-NEXT:    ; implicit-def: $vgpr25
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-  ; GCN-NEXT:    v_readfirstlane_b32 s17, v16
+  ; GCN-NEXT:    v_readfirstlane_b32 s17, v0
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    s_lshl_b32 s18, s17, 7
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    v_add_lshl_u32 v231, v2, s18, 1
+  ; GCN-NEXT:    v_add_u32_e32 v2, s17, v4
+  ; GCN-NEXT:    v_and_b32_e32 v2, 0x1fffffff, v2
+  ; GCN-NEXT:    ; implicit-def: $sgpr16
+  ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s16
+  ; GCN-NEXT:    ; implicit-def: $vgpr5
+  ; GCN-NEXT:    ; implicit-def: $vgpr9
+  ; GCN-NEXT:    v_add_lshl_u32 v172, v5, v2, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s17, 4, v9
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; GCN-NEXT:    ; implicit-def: $vgpr17
+  ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s6
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    v_add_lshl_u32 v224, v2, v1, 1
   ; GCN-NEXT:    ; implicit-def: $sgpr15
+  ; GCN-NEXT:    v_add_u32_e32 v1, s15, v224
   ; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
-  ; GCN-NEXT:    s_lshl_b32 s18, s17, 7
-  ; GCN-NEXT:    ; implicit-def: $vgpr18
-  ; GCN-NEXT:    v_add_lshl_u32 v230, v18, s18, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v25, s17, 4, v25
-  ; GCN-NEXT:    v_mul_lo_u32 v25, v25, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v226, v25, v17, 1
-  ; GCN-NEXT:    v_add_u32_e32 v17, s15, v226
-  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v224, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v1, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, 64, v17
-  ; GCN-NEXT:    ; implicit-def: $vgpr213
+  ; GCN-NEXT:    v_add_u32_e32 v72, 64, v1
+  ; GCN-NEXT:    ; implicit-def: $vgpr230
   ; GCN-NEXT:    ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
-  ; GCN-NEXT:    ; implicit-def: $vgpr246
-  ; GCN-NEXT:    v_add_u32_e32 v188, 0x80, v17
+  ; GCN-NEXT:    ; implicit-def: $vgpr240
+  ; GCN-NEXT:    v_add_u32_e32 v208, 0x80, v1
   ; GCN-NEXT:    ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
   ; GCN-NEXT:    ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
-  ; GCN-NEXT:    ; implicit-def: $vgpr19
-  ; GCN-NEXT:    ; implicit-def: $vgpr26
-  ; GCN-NEXT:    ; implicit-def: $vgpr27
-  ; GCN-NEXT:    v_add_u32_e32 v227, 0xc0, v17
-  ; GCN-NEXT:    v_add_u32_e32 v231, v19, v26
-  ; GCN-NEXT:    v_add_u32_e32 v232, v19, v27
-  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr28
-  ; GCN-NEXT:    ; implicit-def: $vgpr29
-  ; GCN-NEXT:    v_add_u32_e32 v233, v19, v28
-  ; GCN-NEXT:    v_add_u32_e32 v234, v19, v29
+  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr10
+  ; GCN-NEXT:    ; implicit-def: $vgpr11
+  ; GCN-NEXT:    ; implicit-def: $vgpr12
+  ; GCN-NEXT:    ; implicit-def: $vgpr13
+  ; GCN-NEXT:    v_add_u32_e32 v225, 0xc0, v1
+  ; GCN-NEXT:    v_add_u32_e32 v226, v3, v10
+  ; GCN-NEXT:    v_add_u32_e32 v227, v3, v11
+  ; GCN-NEXT:    v_add_u32_e32 v228, v3, v12
+  ; GCN-NEXT:    v_add_u32_e32 v232, v3, v13
   ; GCN-NEXT:    ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
-  ; GCN-NEXT:    ; implicit-def: $sgpr7
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
   ; GCN-NEXT:    ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
   ; GCN-NEXT:    ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    ; implicit-def: $sgpr7
   ; GCN-NEXT:    ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
-  ; GCN-NEXT:    ; implicit-def: $vgpr20
-  ; GCN-NEXT:    v_add_u32_e32 v18, s17, v20
-  ; GCN-NEXT:    v_and_b32_e32 v18, 0x1fffffff, v18
-  ; GCN-NEXT:    ; implicit-def: $sgpr16
-  ; GCN-NEXT:    v_mul_lo_u32 v18, v18, s16
-  ; GCN-NEXT:    ; implicit-def: $vgpr21
-  ; GCN-NEXT:    v_add_lshl_u32 v199, v21, v18, 1
-  ; GCN-NEXT:    ; implicit-def: $vgpr22
-  ; GCN-NEXT:    v_lshl_add_u32 v200, v22, 1, v199
-  ; GCN-NEXT:    ; implicit-def: $vgpr23
-  ; GCN-NEXT:    v_lshl_add_u32 v201, v23, 1, v200
-  ; GCN-NEXT:    ; implicit-def: $vgpr24
-  ; GCN-NEXT:    v_lshl_add_u32 v202, v24, 1, v201
-  ; GCN-NEXT:    ; implicit-def: $vgpr16
-  ; GCN-NEXT:    ; implicit-def: $vgpr18
-  ; GCN-NEXT:    ; implicit-def: $vgpr20
-  ; GCN-NEXT:    ; implicit-def: $vgpr24
-  ; GCN-NEXT:    v_add_u32_e32 v247, v19, v24
-  ; GCN-NEXT:    v_add_u32_e32 v248, v19, v16
-  ; GCN-NEXT:    v_add_u32_e32 v249, v19, v18
-  ; GCN-NEXT:    v_add_u32_e32 v250, v19, v20
   ; GCN-NEXT:    ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
   ; GCN-NEXT:    ; implicit-def: $sgpr14
-  ; GCN-NEXT:    ; implicit-def: $vgpr196
+  ; GCN-NEXT:    ; implicit-def: $vgpr6
+  ; GCN-NEXT:    v_lshl_add_u32 v173, v6, 1, v172
+  ; GCN-NEXT:    ; implicit-def: $vgpr7
+  ; GCN-NEXT:    v_lshl_add_u32 v174, v7, 1, v173
+  ; GCN-NEXT:    ; implicit-def: $vgpr8
+  ; GCN-NEXT:    v_lshl_add_u32 v175, v8, 1, v174
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    ; implicit-def: $vgpr170
+  ; GCN-NEXT:    v_add_u32_e32 v242, v3, v170
+  ; GCN-NEXT:    v_add_u32_e32 v243, v3, v0
+  ; GCN-NEXT:    v_add_u32_e32 v244, v3, v2
+  ; GCN-NEXT:    v_add_u32_e32 v245, v3, v4
+  ; GCN-NEXT:    ; implicit-def: $vgpr160
   ; GCN-NEXT:    ; implicit-def: $sgpr12_sgpr13
-  ; GCN-NEXT:    ; implicit-def: $vgpr211
-  ; GCN-NEXT:    v_max_f32_e32 v212, v211, v211
-  ; GCN-NEXT:    ; implicit-def: $vgpr198
-  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-  ; GCN-NEXT:    ; implicit-def: $vgpr32
-  ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    ; implicit-def: $vgpr34
-  ; GCN-NEXT:    v_add_u32_e32 v210, v19, v34
-  ; GCN-NEXT:    v_add_u32_e32 v206, v19, v33
-  ; GCN-NEXT:    v_add_u32_e32 v205, v19, v32
-  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
-  ; GCN-NEXT:    ; implicit-def: $vgpr21
-  ; GCN-NEXT:    ; implicit-def: $vgpr22
-  ; GCN-NEXT:    ; implicit-def: $vgpr23
-  ; GCN-NEXT:    ; implicit-def: $vgpr30
-  ; GCN-NEXT:    ; implicit-def: $vgpr31
-  ; GCN-NEXT:    v_add_u32_e32 v207, v19, v21
-  ; GCN-NEXT:    v_add_u32_e32 v208, v19, v22
-  ; GCN-NEXT:    v_add_u32_e32 v209, v19, v23
-  ; GCN-NEXT:    v_add_u32_e32 v203, v19, v30
-  ; GCN-NEXT:    v_add_u32_e32 v204, v19, v31
-  ; GCN-NEXT:    ; kill: killed $vgpr17
+  ; GCN-NEXT:    ; implicit-def: $vgpr241
+  ; GCN-NEXT:    v_max_f32_e32 v246, v241, v241
+  ; GCN-NEXT:    ; implicit-def: $vgpr171
   ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    ; implicit-def: $vgpr161
+  ; GCN-NEXT:    ; implicit-def: $vgpr162
+  ; GCN-NEXT:    ; implicit-def: $vgpr163
+  ; GCN-NEXT:    ; implicit-def: $vgpr165
+  ; GCN-NEXT:    v_add_u32_e32 v247, v3, v165
+  ; GCN-NEXT:    v_add_u32_e32 v248, v3, v161
+  ; GCN-NEXT:    v_add_u32_e32 v249, v3, v162
+  ; GCN-NEXT:    v_add_u32_e32 v250, v3, v163
+  ; GCN-NEXT:    ; implicit-def: $vgpr164
+  ; GCN-NEXT:    ; implicit-def: $vgpr166
+  ; GCN-NEXT:    ; implicit-def: $vgpr167
+  ; GCN-NEXT:    ; implicit-def: $vgpr168
+  ; GCN-NEXT:    v_add_u32_e32 v176, v3, v168
+  ; GCN-NEXT:    v_add_u32_e32 v177, v3, v164
+  ; GCN-NEXT:    v_add_u32_e32 v178, v3, v166
+  ; GCN-NEXT:    v_add_u32_e32 v179, v3, v167
+  ; GCN-NEXT:    ; kill: killed $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-  ; GCN-NEXT:    ; implicit-def: $vgpr197
+  ; GCN-NEXT:    ; implicit-def: $vgpr169
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v230, v[64:67]
+  ; GCN-NEXT:    ds_write_b128 v231, v[64:67]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b128 v230, v[68:71] offset:1024
+  ; GCN-NEXT:    ds_write_b128 v231, v[68:71] offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[180:183], v224, s[8:11], 0 offen offset:64 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[184:187], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ds_read_b128 v[64:67], v213
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v230
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[64:67], v213 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v230 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
-  ; GCN-NEXT:    ds_read_b128 v[64:67], v213 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v230 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[168:171], v213 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[188:191], v230 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[172:175], v246
+  ; GCN-NEXT:    ds_read_b128 v[192:195], v240
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[176:179], v246 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[196:199], v240 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[180:183], v246 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[200:203], v240 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[184:187], v246 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[204:207], v240 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -150,293 +150,293 @@
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v230, v[160:163]
+  ; GCN-NEXT:    ds_write_b128 v231, v[180:183]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b128 v230, v[164:167] offset:1024
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
+  ; GCN-NEXT:    ds_write_b128 v231, v[184:187] offset:1024
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[152:153], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[154:155], v[64:79]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v224, s[8:11], 0 offen offset:128 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[180:183], v208, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ds_read_b128 v[188:191], v213
+  ; GCN-NEXT:    ds_read_b128 v[184:187], v230
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[192:195], v213 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[188:191], v230 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[164:167], v213 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[208:211], v230 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[214:217], v213 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[212:215], v230 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[218:221], v246
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[192:193], v[156:157], v[112:127]
+  ; GCN-NEXT:    ds_read_b128 v[216:219], v240
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[222:225], v246 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[220:223], v240 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[168:171], v246 offset:1024
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[194:195], v[158:159], v[112:127]
+  ; GCN-NEXT:    ds_read_b128 v[192:195], v240 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[188:191], v246 offset:1536
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[146:147], v[112:127]
+  ; GCN-NEXT:    ds_read_b128 v[184:187], v240 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v230, v[152:155]
+  ; GCN-NEXT:    ds_write_b128 v231, v[152:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b128 v230, v[160:163] offset:1024
+  ; GCN-NEXT:    ds_write_b128 v231, v[180:183] offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v224, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
-  ; GCN-NEXT:    buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[180:183], v225, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[216:217], v[140:141], v[112:127]
+  ; GCN-NEXT:    buffer_load_dwordx2 v[224:225], v226, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[226:227], v227, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_perm_b32 v234, v226, v224, s7
+  ; GCN-NEXT:    buffer_load_dwordx2 v[228:229], v228, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[156:157], v[64:79]
+  ; GCN-NEXT:    buffer_load_dwordx2 v[216:217], v232, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
-  ; GCN-NEXT:    v_perm_b32 v238, v162, v160, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
-  ; GCN-NEXT:    v_perm_b32 v240, v162, v160, s7
-  ; GCN-NEXT:    v_perm_b32 v242, v163, v161, s5
-  ; GCN-NEXT:    v_perm_b32 v244, v163, v161, s7
-  ; GCN-NEXT:    ds_read_b128 v[160:163], v213
+  ; GCN-NEXT:    v_perm_b32 v232, v226, v224, s5
+  ; GCN-NEXT:    v_perm_b32 v236, v227, v225, s5
+  ; GCN-NEXT:    v_perm_b32 v238, v227, v225, s7
+  ; GCN-NEXT:    v_perm_b32 v233, v216, v228, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[156:157], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v235, v216, v228, s7
+  ; GCN-NEXT:    v_perm_b32 v237, v217, v229, s5
+  ; GCN-NEXT:    v_perm_b32 v239, v217, v229, s7
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[206:207], v[158:159], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[158:159], v[96:111]
+  ; GCN-NEXT:    ds_read_b128 v[196:199], v230
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_perm_b32 v239, v174, v172, s5
-  ; GCN-NEXT:    v_perm_b32 v241, v174, v172, s7
-  ; GCN-NEXT:    v_perm_b32 v243, v175, v173, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
-  ; GCN-NEXT:    v_perm_b32 v245, v175, v173, s7
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[218:221], v213 offset:512
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[156:157], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[202:203], v[158:159], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[188:189], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[142:143], v[112:127]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[146:147], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[190:191], v[146:147], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[208:209], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[196:197], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[140:141], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[140:141], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[146:147], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[198:199], v[150:151], v[112:127]
+  ; GCN-NEXT:    ds_read_b128 v[196:199], v230 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[172:175], v213 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[204:207], v230 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[160:163], v213 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[212:215], v230 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[184:187], v246
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[142:143], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[184:187], v240
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[214:217], v246 offset:512
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[142:143], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[192:193], v[140:141], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[148:149], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[194:195], v[142:143], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[150:151], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[212:215], v240 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[176:179], v246 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[200:203], v240 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[160:163], v246 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[216:219], v240 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v230, v[152:155]
+  ; GCN-NEXT:    ds_write_b128 v231, v[152:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b128 v230, v[226:229] offset:1024
+  ; GCN-NEXT:    ds_write_b128 v231, v[180:183] offset:1024
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[156:159], v213
+  ; GCN-NEXT:    ds_read_b128 v[152:155], v230
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[226:229], v213 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[180:183], v230 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[180:183], v213 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[224:227], v230 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[152:155], v213 offset:1536
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[150:151], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[204:205], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
+  ; GCN-NEXT:    ds_read_b128 v[184:187], v230 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[230:233], v246
+  ; GCN-NEXT:    ds_read_b128 v[228:231], v240
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[234:237], v246 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[188:191], v240 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[184:187], v246 offset:1024
+  ; GCN-NEXT:    ds_read_b128 v[220:223], v240 offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
-  ; GCN-NEXT:    ds_read_b128 v[156:159], v246 offset:1536
+  ; GCN-NEXT:    ds_read_b128 v[196:199], v240 offset:1536
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b64 v199, v[238:239]
+  ; GCN-NEXT:    ds_write_b64 v172, v[232:233]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[212:213], v[136:137], v[96:111]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v200, v[240:241]
+  ; GCN-NEXT:    ds_write_b64 v173, v[234:235]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v201, v[242:243]
+  ; GCN-NEXT:    ds_write_b64 v174, v[236:237]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v202, v[244:245]
+  ; GCN-NEXT:    ds_write_b64 v175, v[238:239]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v242, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
-  ; GCN-NEXT:    buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v243, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[232:233], v244, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[234:235], v245, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[206:207], v[150:151], v[80:95]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_perm_b32 v188, v194, v192, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
-  ; GCN-NEXT:    v_perm_b32 v189, v220, v218, s5
-  ; GCN-NEXT:    v_perm_b32 v191, v220, v218, s7
-  ; GCN-NEXT:    v_perm_b32 v190, v194, v192, s7
-  ; GCN-NEXT:    v_perm_b32 v192, v195, v193, s5
-  ; GCN-NEXT:    v_perm_b32 v194, v195, v193, s7
-  ; GCN-NEXT:    v_perm_b32 v193, v221, v219, s5
-  ; GCN-NEXT:    v_perm_b32 v195, v221, v219, s7
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[132:133], v[112:127]
+  ; GCN-NEXT:    v_perm_b32 v152, v158, v156, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[138:139], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v153, v234, v232, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[136:137], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[134:135], v[112:127]
+  ; GCN-NEXT:    v_perm_b32 v154, v158, v156, s7
+  ; GCN-NEXT:    v_perm_b32 v155, v234, v232, s7
+  ; GCN-NEXT:    v_perm_b32 v156, v159, v157, s5
+  ; GCN-NEXT:    v_perm_b32 v158, v159, v157, s7
+  ; GCN-NEXT:    v_perm_b32 v157, v235, v233, s5
+  ; GCN-NEXT:    v_perm_b32 v159, v235, v233, s7
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[180:181], v[132:133], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[202:203], v[138:139], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[136:137], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[228:229], v[128:129], v[112:127]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[182:183], v[134:135], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[224:225], v[132:133], v[80:95]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[218:219], v[138:139], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[130:131], v[112:127]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[188:189], v[128:129], v[96:111]
   ; GCN-NEXT:    s_nop 9
-  ; GCN-NEXT:    v_mul_f32_e32 v213, s4, v112
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v113
-  ; GCN-NEXT:    v_max3_f32 v213, v213, s14, v218
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v114
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v115
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v116
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v117
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v118
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v119
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v120
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v121
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v122
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v123
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v124
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v125
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v218, s4, v126
-  ; GCN-NEXT:    v_mul_f32_e32 v219, s4, v127
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v218, v219
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v228, s4, v112
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v113
+  ; GCN-NEXT:    v_max3_f32 v228, v228, s14, v229
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v114
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v115
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v116
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[226:227], v[134:135], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v117
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v118
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v119
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v120
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v121
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[132:133], v[64:79]
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v122
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v123
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v124
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v125
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[190:191], v[130:131], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v229, s4, v126
+  ; GCN-NEXT:    v_mul_f32_e32 v230, s4, v127
+  ; GCN-NEXT:    v_max3_f32 v228, v228, v229, v230
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[220:221], v[128:129], v[80:95]
   ; GCN-NEXT:    s_nop 6
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v96
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v97
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v98
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v99
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v100
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v101
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v102
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v103
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v104
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v105
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v106
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v107
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v108
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v109
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v214, s4, v110
-  ; GCN-NEXT:    v_mul_f32_e32 v215, s4, v111
-  ; GCN-NEXT:    v_max3_f32 v213, v213, v214, v215
+  ; GCN-NEXT:    v_mul_f32_e32 v180, s4, v96
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v97
+  ; GCN-NEXT:    v_max3_f32 v180, v228, v180, v181
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v98
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v99
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v100
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[134:135], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v101
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v103
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v104
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v105
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[222:223], v[130:131], v[80:95]
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v106
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v107
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v108
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v109
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[196:197], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v181, s4, v110
+  ; GCN-NEXT:    v_mul_f32_e32 v182, s4, v111
+  ; GCN-NEXT:    v_max3_f32 v180, v180, v181, v182
   ; GCN-NEXT:    v_mul_f32_e32 v140, s4, v80
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v81
-  ; GCN-NEXT:    v_max3_f32 v140, v213, v140, v141
+  ; GCN-NEXT:    v_max3_f32 v140, v180, v140, v141
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v82
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[198:199], v[130:131], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v142, s4, v83
   ; GCN-NEXT:    v_max3_f32 v140, v140, v141, v142
   ; GCN-NEXT:    v_mul_f32_e32 v141, s4, v84
@@ -481,41 +481,35 @@
   ; GCN-NEXT:    v_mul_f32_e32 v129, s4, v78
   ; GCN-NEXT:    v_mul_f32_e32 v130, s4, v79
   ; GCN-NEXT:    v_max3_f32 v128, v128, v129, v130
-  ; GCN-NEXT:    ds_bpermute_b32 v129, v196, v128
+  ; GCN-NEXT:    ds_bpermute_b32 v129, v160, v128
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[130:133], v198
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_max_f32_e32 v129, v129, v129
   ; GCN-NEXT:    v_max_f32_e32 v128, v128, v129
-  ; GCN-NEXT:    ds_bpermute_b32 v129, v196, v128
+  ; GCN-NEXT:    ds_bpermute_b32 v129, v160, v128
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    v_cndmask_b32_e64 v128, v129, v128, s[12:13]
   ; GCN-NEXT:    v_max_f32_e32 v128, v128, v128
-  ; GCN-NEXT:    v_max_f32_e32 v128, v212, v128
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v113, -v128
+  ; GCN-NEXT:    v_max_f32_e32 v137, v246, v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v113, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v114, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v114, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v115, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v115, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v116, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v116, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v141, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v117, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v117, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v118, -v128
-  ; GCN-NEXT:    v_fma_f32 v112, s4, v112, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v118, -v137
+  ; GCN-NEXT:    v_fma_f32 v112, s4, v112, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_fma_f32 v113, s4, v119, -v128
-  ; GCN-NEXT:    v_fma_f32 v118, s4, v120, -v128
-  ; GCN-NEXT:    v_fma_f32 v120, s4, v121, -v128
+  ; GCN-NEXT:    v_fma_f32 v113, s4, v119, -v137
+  ; GCN-NEXT:    v_fma_f32 v118, s4, v120, -v137
+  ; GCN-NEXT:    v_fma_f32 v120, s4, v121, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v112, 0x3fb8aa3b, v112
   ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v113
   ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v120
-  ; GCN-NEXT:    v_fma_f32 v120, s4, v122, -v128
+  ; GCN-NEXT:    v_fma_f32 v120, s4, v122, -v137
   ; GCN-NEXT:    v_exp_f32_e32 v114, v138
   ; GCN-NEXT:    v_exp_f32_e32 v115, v139
   ; GCN-NEXT:    v_exp_f32_e32 v116, v140
@@ -526,37 +520,41 @@
   ; GCN-NEXT:    v_exp_f32_e32 v120, v144
   ; GCN-NEXT:    v_exp_f32_e32 v113, v112
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v119, v114
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v121, v116
-  ; GCN-NEXT:    v_sub_f32_e32 v129, v211, v128
+  ; GCN-NEXT:    v_sub_f32_e32 v128, v241, v137
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v113
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v129
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[128:131], v171
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v121, v116
+  ; GCN-NEXT:    ds_read_b128 v[132:135], v171 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v122, s4, v123, -v128
   ; GCN-NEXT:    v_pack_b32_f16 v146, v112, v119
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v115
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v171 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v122, s4, v123, -v137
   ; GCN-NEXT:    v_mul_f32_e32 v151, 0x3fb8aa3b, v122
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v117
-  ; GCN-NEXT:    v_fma_f32 v122, s4, v124, -v128
   ; GCN-NEXT:    v_pack_b32_f16 v147, v112, v121
-  ; GCN-NEXT:    v_exp_f32_e32 v112, v129
+  ; GCN-NEXT:    v_exp_f32_e32 v112, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v117
+  ; GCN-NEXT:    v_fma_f32 v122, s4, v124, -v137
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v124, v118
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v122
-  ; GCN-NEXT:    v_fma_f32 v125, s4, v125, -v128
-  ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v119, v143
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v171 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
@@ -565,18 +563,19 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e64 v20, v20, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v21, v21, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v22, v22, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v23, v23, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v24, v24, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v25, v25, v112
-  ; GCN-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[146:147], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v121, v148
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v123, v124
+  ; GCN-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v119
+  ; GCN-NEXT:    v_fma_f32 v124, s4, v126, -v137
   ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
@@ -585,568 +584,576 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v134, v123, v124
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v119
-  ; GCN-NEXT:    v_fma_f32 v124, s4, v126, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v120
-  ; GCN-NEXT:    v_exp_f32_e32 v121, v148
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v122
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[138:139], v[146:147], v[0:15]
   ; GCN-NEXT:    v_exp_f32_e32 v122, v149
-  ; GCN-NEXT:    v_pack_b32_f16 v135, v130, v126
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v137
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v128, v126
   ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v124
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v121
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v125
-  ; GCN-NEXT:    v_fma_f32 v139, s4, v96, -v128
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v128
+  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v127
+  ; GCN-NEXT:    ds_read_b128 v[126:129], v169
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v125, s4, v125, -v137
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v123, v150
-  ; GCN-NEXT:    v_mul_f32_e32 v127, 0x3fb8aa3b, v127
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v101, -v128
-  ; GCN-NEXT:    v_fma_f32 v64, s4, v64, -v128
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v65, -v128
-  ; GCN-NEXT:    v_fma_f32 v68, s4, v68, -v128
-  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v128
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v125
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v121
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v96, -v137
+  ; GCN-NEXT:    v_fma_f32 v64, s4, v64, -v137
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v65, -v137
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v68, -v137
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[148:149], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v124, v151
-  ; GCN-NEXT:    ds_read_b128 v[130:133], v197
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[130:133], v169 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v122
-  ; GCN-NEXT:    v_exp_f32_e32 v96, v129
-  ; GCN-NEXT:    v_fma_f32 v137, s4, v97, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v136
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v123
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v137
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[148:149], v[32:47]
+  ; GCN-NEXT:    v_fma_f32 v135, s4, v97, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v122
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_pack_b32_f16 v134, v142, v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[140:141], v[148:149], v[0:15]
   ; GCN-NEXT:    v_exp_f32_e32 v97, v125
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v137
-  ; GCN-NEXT:    v_fma_f32 v137, s4, v98, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v137
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v124
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v99, -v128
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v135
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v123
+  ; GCN-NEXT:    v_fma_f32 v140, s4, v98, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v180, 0x3fb8aa3b, v140
+  ; GCN-NEXT:    v_fma_f32 v140, s4, v99, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v181, 0x3fb8aa3b, v140
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[148:149], v[48:63]
   ; GCN-NEXT:    v_exp_f32_e32 v98, v138
-  ; GCN-NEXT:    v_exp_f32_e32 v99, v127
-  ; GCN-NEXT:    v_mul_f32_e32 v150, 0x3fb8aa3b, v135
-  ; GCN-NEXT:    v_pack_b32_f16 v127, v136, v134
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v197 offset:1152
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v124
+  ; GCN-NEXT:    v_pack_b32_f16 v135, v135, v138
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v99, v139
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v169 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v197 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v169 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v100, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v96
-  ; GCN-NEXT:    v_exp_f32_e32 v100, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v131, v97
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b64 v199, v[188:189]
+  ; GCN-NEXT:    ds_write_b64 v172, v[152:153]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v200, v[190:191]
+  ; GCN-NEXT:    ds_write_b64 v173, v[154:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v201, v[192:193]
+  ; GCN-NEXT:    ds_write_b64 v174, v[156:157]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v202, v[194:195]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
-  ; GCN-NEXT:    v_exp_f32_e32 v101, v125
-  ; GCN-NEXT:    v_pack_b32_f16 v146, v130, v131
+  ; GCN-NEXT:    ds_write_b64 v175, v[158:159]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[126:127], v247, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v98
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
-  ; GCN-NEXT:    v_fma_f32 v134, s4, v102, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v156, 0x3fb8aa3b, v134
-  ; GCN-NEXT:    buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v248, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v102, v142
-  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v249, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[150:151], v250, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v96
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[130:131], v[134:135], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v97
+  ; GCN-NEXT:    v_fma_f32 v153, s4, v100, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v100, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v153
+  ; GCN-NEXT:    v_pack_b32_f16 v130, v152, v130
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v101, -v137
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v99
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v103, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v103, v150
-  ; GCN-NEXT:    v_fma_f32 v139, s4, v105, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v147, v147, v126
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[138:139], v[134:135], v[0:15]
+  ; GCN-NEXT:    v_exp_f32_e32 v101, v125
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v103, -v137
+  ; GCN-NEXT:    v_fma_f32 v139, s4, v104, -v137
+  ; GCN-NEXT:    v_perm_b32 v156, v147, v127, s5
+  ; GCN-NEXT:    v_perm_b32 v152, v146, v126, s5
+  ; GCN-NEXT:    v_perm_b32 v153, v150, v148, s5
+  ; GCN-NEXT:    v_perm_b32 v155, v150, v148, s7
+  ; GCN-NEXT:    v_perm_b32 v150, v147, v127, s7
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v102, -v137
+  ; GCN-NEXT:    v_perm_b32 v154, v146, v126, s7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v98
   ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v127
-  ; GCN-NEXT:    v_perm_b32 v152, v135, v131, s5
-  ; GCN-NEXT:    v_perm_b32 v154, v135, v131, s7
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v104, -v128
-  ; GCN-NEXT:    v_perm_b32 v126, v134, v130, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
-  ; GCN-NEXT:    v_perm_b32 v150, v134, v130, s7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v99
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[134:135], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v135, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_exp_f32_e32 v102, v180
+  ; GCN-NEXT:    v_pack_b32_f16 v131, v126, v127
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v100
-  ; GCN-NEXT:    v_exp_f32_e32 v104, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v135
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v101
-  ; GCN-NEXT:    ds_read_b128 v[130:133], v198
+  ; GCN-NEXT:    v_perm_b32 v157, v151, v149, s5
+  ; GCN-NEXT:    v_perm_b32 v151, v151, v149, s7
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[130:131], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v103, v181
+  ; GCN-NEXT:    ds_read_b128 v[126:129], v171
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_perm_b32 v127, v144, v142, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
-  ; GCN-NEXT:    v_pack_b32_f16 v148, v134, v135
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v106, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v105, v125
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v102
-  ; GCN-NEXT:    v_perm_b32 v151, v144, v142, s7
-  ; GCN-NEXT:    v_perm_b32 v153, v145, v143, s5
-  ; GCN-NEXT:    v_perm_b32 v155, v145, v143, s7
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v106, v156
-  ; GCN-NEXT:    v_mul_f32_e32 v156, 0x3fb8aa3b, v135
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v103
-  ; GCN-NEXT:    v_fma_f32 v136, s4, v107, -v128
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v134, v135
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v136
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v107, v138
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[146:149], v171 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v108, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v104
-  ; GCN-NEXT:    v_exp_f32_e32 v108, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v131, v105
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
-  ; GCN-NEXT:    v_fma_f32 v142, s4, v109, -v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[130:131], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v101
+  ; GCN-NEXT:    v_exp_f32_e32 v104, v136
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v105, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v139
+  ; GCN-NEXT:    v_pack_b32_f16 v134, v134, v132
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v102
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[140:141], v[130:131], v[0:15]
+  ; GCN-NEXT:    v_exp_f32_e32 v105, v125
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v106, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[130:131], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v103
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v107, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v106, v138
+  ; GCN-NEXT:    v_exp_f32_e32 v107, v135
+  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_pack_b32_f16 v135, v132, v130
+  ; GCN-NEXT:    ds_read_b128 v[130:133], v171 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v171 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31]
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v108, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v104
+  ; GCN-NEXT:    v_exp_f32_e32 v108, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v127
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v105
+  ; GCN-NEXT:    v_fma_f32 v144, s4, v109, -v137
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[134:135], v[32:47]
+  ; GCN-NEXT:    v_pack_b32_f16 v146, v126, v127
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v110, -v137
   ; GCN-NEXT:    v_exp_f32_e32 v109, v125
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v142
-  ; GCN-NEXT:    v_pack_b32_f16 v142, v130, v131
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v110, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v106
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
-  ; GCN-NEXT:    v_mul_f32_e32 v134, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v131, v107
-  ; GCN-NEXT:    v_exp_f32_e32 v110, v156
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v111, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v135, 0x3fb8aa3b, v135
-  ; GCN-NEXT:    v_pack_b32_f16 v143, v130, v131
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v111, v146
-  ; GCN-NEXT:    v_fma_f32 v139, s4, v80, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v108
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v129
-  ; GCN-NEXT:    ds_read_b128 v[130:133], v197
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v139, v109
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
-  ; GCN-NEXT:    v_fma_f32 v144, s4, v81, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v125
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v106
   ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v144
-  ; GCN-NEXT:    v_pack_b32_f16 v144, v138, v139
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v110
-  ; GCN-NEXT:    v_fma_f32 v137, s4, v82, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v111
-  ; GCN-NEXT:    v_mul_f32_e32 v156, 0x3fb8aa3b, v137
-  ; GCN-NEXT:    v_fma_f32 v137, s4, v83, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v157, 0x3fb8aa3b, v137
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v135
-  ; GCN-NEXT:    v_pack_b32_f16 v145, v136, v134
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v197 offset:1152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[134:135], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v130, 0x3fb8aa3b, v127
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v107
+  ; GCN-NEXT:    v_exp_f32_e32 v110, v142
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v111, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v131, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_pack_b32_f16 v147, v126, v127
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v111, v143
+  ; GCN-NEXT:    v_fma_f32 v135, s4, v80, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v108
+  ; GCN-NEXT:    v_fma_f32 v138, s4, v81, -v137
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[126:129], v169
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v135
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v109
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v169 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_pack_b32_f16 v134, v134, v135
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v125
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v138
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v110
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v82, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v111
+  ; GCN-NEXT:    v_mul_f32_e32 v180, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v83, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v181, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v131
+  ; GCN-NEXT:    v_pack_b32_f16 v135, v132, v130
+  ; GCN-NEXT:    ds_read_b128 v[130:133], v169 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v197 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v169 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b64 v199, v[126:127]
+  ; GCN-NEXT:    ds_write_b64 v172, v[152:153]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v200, v[150:151]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
+  ; GCN-NEXT:    ds_write_b64 v173, v[154:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v201, v[152:153]
+  ; GCN-NEXT:    ds_write_b64 v174, v[156:157]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v202, v[154:155]
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v84, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v84, v129
-  ; GCN-NEXT:    v_fma_f32 v130, s4, v85, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v80
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v127
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v125
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v130
+  ; GCN-NEXT:    ds_write_b64 v175, v[150:151]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[126:127], v176, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v81
-  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v127
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
-  ; GCN-NEXT:    v_fma_f32 v134, s4, v86, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v158, 0x3fb8aa3b, v134
-  ; GCN-NEXT:    buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v177, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v178, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[150:151], v179, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v82
-  ; GCN-NEXT:    v_exp_f32_e32 v86, v156
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v138, v83
+  ; GCN-NEXT:    v_fma_f32 v153, s4, v84, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v80
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[134:135], v[32:47]
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v85, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v125
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v81
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_fma_f32 v139, s4, v87, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v87, v157
-  ; GCN-NEXT:    v_pack_b32_f16 v127, v127, v138
-  ; GCN-NEXT:    v_fma_f32 v138, s4, v89, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v153
+  ; GCN-NEXT:    v_pack_b32_f16 v142, v152, v142
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[134:135], v[0:15]
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v87, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v131, 0x3fb8aa3b, v131
   ; GCN-NEXT:    ; implicit-def: $sgpr0
-  ; GCN-NEXT:    v_perm_b32 v154, v135, v131, s5
-  ; GCN-NEXT:    v_perm_b32 v156, v135, v131, s7
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v88, -v128
-  ; GCN-NEXT:    v_perm_b32 v150, v134, v130, s5
-  ; GCN-NEXT:    v_perm_b32 v152, v134, v130, s7
-  ; GCN-NEXT:    ds_read_b128 v[130:133], v198
+  ; GCN-NEXT:    v_add_u32_e32 v170, s0, v170
+  ; GCN-NEXT:    v_add_u32_e32 v165, s0, v165
+  ; GCN-NEXT:    v_add_u32_e32 v168, s0, v168
+  ; GCN-NEXT:    v_add_u32_e32 v167, s0, v167
+  ; GCN-NEXT:    v_add_u32_e32 v166, s0, v166
+  ; GCN-NEXT:    v_add_u32_e32 v164, s0, v164
+  ; GCN-NEXT:    v_add_u32_e32 v163, s0, v163
+  ; GCN-NEXT:    v_add_u32_e32 v162, s0, v162
+  ; GCN-NEXT:    v_add_u32_e32 v161, s0, v161
+  ; GCN-NEXT:    v_perm_b32 v158, v147, v127, s5
+  ; GCN-NEXT:    v_perm_b32 v176, v147, v127, s7
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v86, -v137
+  ; GCN-NEXT:    v_perm_b32 v154, v146, v126, s5
+  ; GCN-NEXT:    v_perm_b32 v156, v146, v126, s7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v82
+  ; GCN-NEXT:    v_mul_f32_e32 v130, 0x3fb8aa3b, v127
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v83
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v180
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v181
+  ; GCN-NEXT:    v_pack_b32_f16 v143, v126, v127
+  ; GCN-NEXT:    v_fma_f32 v135, s4, v88, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v84
+  ; GCN-NEXT:    v_fma_f32 v138, s4, v89, -v137
+  ; GCN-NEXT:    v_perm_b32 v155, v150, v148, s5
+  ; GCN-NEXT:    v_perm_b32 v157, v150, v148, s7
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[142:143], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[126:129], v171
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v84
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v135
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v135, v85
-  ; GCN-NEXT:    v_perm_b32 v151, v146, v142, s5
-  ; GCN-NEXT:    v_perm_b32 v153, v146, v142, s7
-  ; GCN-NEXT:    v_perm_b32 v155, v147, v143, s5
-  ; GCN-NEXT:    v_perm_b32 v157, v147, v143, s7
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:576
+  ; GCN-NEXT:    v_perm_b32 v159, v151, v149, s5
+  ; GCN-NEXT:    v_perm_b32 v177, v151, v149, s7
+  ; GCN-NEXT:    ds_read_b128 v[146:149], v171 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
   ; GCN-NEXT:    v_exp_f32_e32 v89, v125
-  ; GCN-NEXT:    v_pack_b32_f16 v146, v134, v135
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v86
-  ; GCN-NEXT:    v_fma_f32 v135, s4, v90, -v128
+  ; GCN-NEXT:    v_pack_b32_f16 v134, v134, v135
   ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v138
-  ; GCN-NEXT:    v_mul_f32_e32 v148, 0x3fb8aa3b, v135
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v90, v158
-  ; GCN-NEXT:    v_mul_f32_e32 v158, 0x3fb8aa3b, v64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v87
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v91, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v91, v139
-  ; GCN-NEXT:    v_mul_f32_e32 v127, 0x3fb8aa3b, v127
-  ; GCN-NEXT:    v_pack_b32_f16 v147, v134, v126
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:1152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v86
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v90, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v130
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v87
+  ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    v_fma_f32 v133, s4, v91, -v137
+  ; GCN-NEXT:    v_pack_b32_f16 v135, v132, v130
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v133
+  ; GCN-NEXT:    ds_read_b128 v[130:133], v171 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v171 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
-  ; GCN-NEXT:    v_fma_f32 v130, s4, v92, -v128
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v93, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v178, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v92, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v130
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v89
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v93, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v130, v126, v130
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v92, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v89
+  ; GCN-NEXT:    v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[134:135], v[32:47]
+  ; GCN-NEXT:    v_pack_b32_f16 v146, v126, v136
+  ; GCN-NEXT:    v_fma_f32 v126, s4, v94, -v137
   ; GCN-NEXT:    v_exp_f32_e32 v93, v125
-  ; GCN-NEXT:    v_fma_f32 v126, s4, v94, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v125, v90
-  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v126
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v66, -v137
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v67, -v137
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[134:135], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v130, 0x3fb8aa3b, v126
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v91
-  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v95, -v128
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v94, v148
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v93
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v95, v127
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v127, v92
-  ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v131
-  ; GCN-NEXT:    v_pack_b32_f16 v131, v125, v126
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v125, v129
-  ; GCN-NEXT:    ds_read_b128 v[132:135], v197
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v144
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v95, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v131, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_pack_b32_f16 v147, v125, v126
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v95, v142
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v169
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[150:153], v169 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v65
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v66, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v126, v142
-  ; GCN-NEXT:    v_pack_b32_f16 v142, v127, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v92
+  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v93
+  ; GCN-NEXT:    v_exp_f32_e32 v125, v127
+  ; GCN-NEXT:    v_pack_b32_f16 v134, v128, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v126, v178
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v94
-  ; GCN-NEXT:    v_mul_f32_e32 v145, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v132, 0x3fb8aa3b, v65
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v65, v95
-  ; GCN-NEXT:    v_fma_f32 v66, s4, v67, -v128
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v127, v143
-  ; GCN-NEXT:    v_pack_b32_f16 v143, v64, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v129, v138
-  ; GCN-NEXT:    v_mul_f32_e32 v141, 0x3fb8aa3b, v66
-  ; GCN-NEXT:    ds_read_b128 v[64:67], v197 offset:1152
+  ; GCN-NEXT:    v_mul_f32_e32 v133, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_exp_f32_e32 v127, v130
+  ; GCN-NEXT:    v_pack_b32_f16 v135, v64, v65
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v169 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[136:139], v197 offset:1728
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[138:141], v169 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v131
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b64 v199, v[150:151]
+  ; GCN-NEXT:    ds_write_b64 v172, v[154:155]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v131, v125
+  ; GCN-NEXT:    v_mul_f32_e32 v154, 0x3fb8aa3b, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v126
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[142:143], v[134:135], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v129
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v200, v[152:153]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v132, v125
-  ; GCN-NEXT:    v_exp_f32_e32 v130, v158
+  ; GCN-NEXT:    ds_write_b64 v173, v[156:157]
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v131, v68
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v201, v[154:155]
+  ; GCN-NEXT:    ds_write_b64 v174, v[158:159]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v202, v[156:157]
+  ; GCN-NEXT:    ds_write_b64 v175, v[176:177]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v126
-  ; GCN-NEXT:    v_exp_f32_e32 v131, v144
-  ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v69
-  ; GCN-NEXT:    v_fma_f32 v69, s4, v71, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v140, v132, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v129
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[150:151], v[134:135], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v130, v136
+  ; GCN-NEXT:    v_mul_f32_e32 v136, 0x3fb8aa3b, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[134:135], v[0:15]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v127
-  ; GCN-NEXT:    v_exp_f32_e32 v132, v145
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v70, -v128
+  ; GCN-NEXT:    v_exp_f32_e32 v131, v132
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v70, -v137
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v71, -v137
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v64, v69
+  ; GCN-NEXT:    v_fma_f32 v71, s4, v72, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v129
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v132, v133
+  ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v71
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v71, v130
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
-  ; GCN-NEXT:    v_fma_f32 v145, s4, v73, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v145
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v133, v141
-  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v69
-  ; GCN-NEXT:    v_pack_b32_f16 v141, v64, v68
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v198
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v72, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v130
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v146
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v143, v131
-  ; GCN-NEXT:    ds_read_b128 v[134:137], v198 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v64, v64, v143
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v144
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v132
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v74, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v65
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v65, v133
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v65, v66, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
-  ; GCN-NEXT:    v_fma_f32 v138, s4, v75, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v142
-  ; GCN-NEXT:    v_mul_f32_e32 v148, 0x3fb8aa3b, v138
-  ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v72
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
-  ; GCN-NEXT:    v_fma_f32 v68, s4, v76, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v76, v146
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v73
-  ; GCN-NEXT:    v_fma_f32 v69, s4, v77, -v128
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
-  ; GCN-NEXT:    v_exp_f32_e32 v77, v147
-  ; GCN-NEXT:    v_pack_b32_f16 v134, v66, v68
-  ; GCN-NEXT:    v_fma_f32 v68, s4, v78, -v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v74
-  ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v69
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v78, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v139, v76
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v75
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v79, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v79, v148
-  ; GCN-NEXT:    v_mul_f32_e32 v128, 0x3fb8aa3b, v65
-  ; GCN-NEXT:    v_pack_b32_f16 v135, v66, v64
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v146
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v197
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_fma_f32 v72, s4, v73, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v72
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[144:145], v[68:69], v[16:31]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[64:67], v197 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v171
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
-  ; GCN-NEXT:    v_exp_f32_e32 v137, v147
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v77
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
-  ; GCN-NEXT:    v_exp_f32_e32 v138, v138
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v78
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
-  ; GCN-NEXT:    s_nop 10
-  ; GCN-NEXT:    v_exp_f32_e32 v52, v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v137
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v142
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v138
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v53, v52
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v49, v79
-  ; GCN-NEXT:    v_pack_b32_f16 v50, v51, v50
-  ; GCN-NEXT:    v_pack_b32_f16 v48, v139, v136
-  ; GCN-NEXT:    v_pack_b32_f16 v51, v54, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, 0, v113
-  ; GCN-NEXT:    v_add_f32_e32 v53, v114, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v115, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v116, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v117, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v118, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v119, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v120, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v121, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v122, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v123, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v124, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v96, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v97, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v98, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v99, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v100, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v101, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v102, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v103, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v104, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v105, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v106, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v107, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v108, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v109, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v110, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v111, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v80, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v81, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v82, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v83, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v84, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v85, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v86, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v87, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v88, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v89, v53
-  ; GCN-NEXT:    v_pack_b32_f16 v49, v140, v49
-  ; GCN-NEXT:    v_add_f32_e32 v53, v90, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v91, v53
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
-  ; GCN-NEXT:    v_add_f32_e32 v53, v92, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v93, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v94, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v95, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v125, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v126, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v127, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v129, v53
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
-  ; GCN-NEXT:    s_nop 9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v130, v53
-  ; GCN-NEXT:    v_add_f32_e32 v0, v131, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v132, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v133, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v72, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v73, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v74, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v75, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v76, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v77, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v78, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v79, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v142, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v137, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v138, v0
-  ; GCN-NEXT:    v_add_f32_e32 v4, v52, v0
-  ; GCN-NEXT:    ds_bpermute_b32 v5, v196, v4
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v197 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v196, v2
-  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    ds_read_b128 v[146:149], v171 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[12:13]
-  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v112
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v197 offset:1728
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_exp_f32_e32 v133, v154
+  ; GCN-NEXT:    v_pack_b32_f16 v64, v64, v71
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[152:153], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v134, v136
+  ; GCN-NEXT:    ds_read_b128 v[150:153], v171 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[68:69], v[0:15]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v131
+  ; GCN-NEXT:    v_exp_f32_e32 v135, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v65, v132
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v74, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_fma_f32 v74, s4, v77, -v137
+  ; GCN-NEXT:    v_pack_b32_f16 v65, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_exp_f32_e32 v136, v70
+  ; GCN-NEXT:    ds_read_b128 v[70:73], v171 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v76, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v133
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v75, -v137
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v68
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[142:143], v[64:65], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v76, v138
+  ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v134
+  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v74
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v76
+  ; GCN-NEXT:    v_pack_b32_f16 v74, v66, v69
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[64:65], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v77, v139
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v78, -v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v135
+  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v77
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[150:151], v[64:65], v[0:15]
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[64:65], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v136
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v79, -v137
+  ; GCN-NEXT:    v_exp_f32_e32 v79, v68
+  ; GCN-NEXT:    v_mul_f32_e32 v139, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_pack_b32_f16 v75, v66, v64
+  ; GCN-NEXT:    ds_read_b128 v[64:67], v169
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v169 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[144:145], v[74:75], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v137, v138
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[74:75], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v138, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v143, v78
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[152:153], v[74:75], v[0:15]
+  ; GCN-NEXT:    v_exp_f32_e32 v140, v140
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[74:75], v[48:63]
+  ; GCN-NEXT:    ; implicit-def: $vgpr58
+  ; GCN-NEXT:    s_nop 10
+  ; GCN-NEXT:    v_add_f32_e32 v57, 0, v113
+  ; GCN-NEXT:    v_add_f32_e32 v57, v114, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v115, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v116, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v117, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v118, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v119, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v120, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v121, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v122, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v123, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v124, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v96, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v97, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v98, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v99, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v100, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v101, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v102, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v103, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v104, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v105, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v106, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v107, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v108, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v109, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v110, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v111, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v80, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v81, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v82, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v83, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v48, v79
+  ; GCN-NEXT:    v_add_f32_e32 v57, v84, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v85, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v86, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v87, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v56, v139
+  ; GCN-NEXT:    v_pack_b32_f16 v52, v141, v142
+  ; GCN-NEXT:    v_pack_b32_f16 v53, v143, v48
+  ; GCN-NEXT:    v_add_f32_e32 v57, v88, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v89, v57
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[52:53], v[16:31]
+  ; GCN-NEXT:    v_add_f32_e32 v57, v90, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v91, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v49, v138
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v56
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v55, v140
+  ; GCN-NEXT:    v_add_f32_e32 v57, v92, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v93, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v94, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v95, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v54, v50, v49
+  ; GCN-NEXT:    v_pack_b32_f16 v55, v55, v51
+  ; GCN-NEXT:    v_add_f32_e32 v57, v125, v57
+  ; GCN-NEXT:    v_add_f32_e32 v57, v126, v57
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[54:55], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[48:51], v169 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[48:49], v[52:53], v[0:15]
+  ; GCN-NEXT:    s_nop 6
+  ; GCN-NEXT:    v_add_f32_e32 v16, v127, v57
+  ; GCN-NEXT:    v_add_f32_e32 v16, v128, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v129, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v130, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v131, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v132, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v133, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v134, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v135, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v136, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v76, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v77, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v78, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v79, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v137, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v138, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v140, v16
+  ; GCN-NEXT:    v_add_f32_e32 v20, v56, v16
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v169 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_bpermute_b32 v16, v160, v20
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[52:53], v[32:47]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v16, v20, v16
+  ; GCN-NEXT:    ds_bpermute_b32 v17, v160, v16
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[50:51], v[54:55], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[54:55], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    s_nop 8
+  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v17, v16, s[12:13]
+  ; GCN-NEXT:    v_fmac_f32_e32 v0, v58, v112
   ; GCN-NEXT:    s_endpgm
 
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..677dad6083c3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -13,21 +13,18 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr3
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr52
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
-  ; GCN-NEXT:    ; implicit-def: $vgpr49
+  ; GCN-NEXT:    ; implicit-def: $vgpr9
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35
+  ; GCN-NEXT:    ; implicit-def: $vgpr70
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
-  ; GCN-NEXT:    ; implicit-def: $vgpr77
-  ; GCN-NEXT:    ; implicit-def: $vgpr78
-  ; GCN-NEXT:    ; implicit-def: $vgpr79
-  ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
-  ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
-  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61
+  ; GCN-NEXT:    ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69
+  ; GCN-NEXT:    ; implicit-def: $vgpr54
+  ; GCN-NEXT:    ; implicit-def: $vgpr55
+  ; GCN-NEXT:    v_add_u32_e32 v71, v54, v52
+  ; GCN-NEXT:    v_add_u32_e32 v72, v55, v52
   ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
@@ -35,459 +32,465 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
-  ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    v_add_lshl_u32 v8, v5, s4, 1
+  ; GCN-NEXT:    ; implicit-def: $vgpr5
+  ; GCN-NEXT:    ; implicit-def: $vgpr57
+  ; GCN-NEXT:    ; implicit-def: $vgpr53
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    ; kill: killed $vgpr71
+  ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    v_add_u32_e32 v5, v5, v52
+  ; GCN-NEXT:    ; kill: killed $vgpr5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
+  ; GCN-NEXT:    ds_write_b128 v8, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:64 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr0
-  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
-  ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $sgpr1
+  ; GCN-NEXT:    v_add_u32_e32 v4, v4, v52
+  ; GCN-NEXT:    buffer_load_dwordx2 v[48:49], v4, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[50:51], v5, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v49
+  ; GCN-NEXT:    ; kill: killed $vgpr4
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v9
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v49 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v9 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
-  ; GCN-NEXT:    ; kill: killed $vgpr1
-  ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
-  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
-  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[32:33], 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[34:35], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v70
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v70 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b128 v48, v[32:35]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+  ; GCN-NEXT:    ds_write_b128 v8, v[0:3]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[40:41], v[16:31]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v49
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v9
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[62:65], v9 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[42:43], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[58:59], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[60:61], v[16:31]
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v70
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[66:67], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[68:69], v[16:31]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[32:33], 0
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
-  ; GCN-NEXT:    ; kill: killed $vgpr83
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ; implicit-def: $vgpr36
+  ; GCN-NEXT:    ; implicit-def: $vgpr37
+  ; GCN-NEXT:    v_add_u32_e32 v32, s20, v32
+  ; GCN-NEXT:    v_and_b32_e32 v32, 0x1fffffff, v32
+  ; GCN-NEXT:    v_mul_lo_u32 v32, v32, s1
+  ; GCN-NEXT:    v_add_lshl_u32 v54, v33, v32, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v55, v36, 1, v54
+  ; GCN-NEXT:    v_lshl_add_u32 v56, v37, 1, v55
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[34:35], v[0:15]
+  ; GCN-NEXT:    v_lshl_add_u32 v57, v57, 1, v56
+  ; GCN-NEXT:    v_perm_b32 v36, v50, v48, s2
+  ; GCN-NEXT:    v_perm_b32 v37, v50, v48, s3
+  ; GCN-NEXT:    v_perm_b32 v38, v51, v49, s2
+  ; GCN-NEXT:    v_perm_b32 v39, v51, v49, s3
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v70 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
-  ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
-  ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
-  ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
-  ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v54, v36
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v55, v37
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
-  ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
-  ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
-  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
-  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
-  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v27
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v87
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
-  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
-  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
-  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
-  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v4
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
-  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
-  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
-  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
-  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
-  ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
-  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    ds_write_b32 v56, v38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v57, v39
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[48:49], v71, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[50:51], v72, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v36, s4, v16
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v37, s4, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v38, s4, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v39, s4, v19
+  ; GCN-NEXT:    v_max3_f32 v36, v36, s0, v37
+  ; GCN-NEXT:    v_mul_f32_e32 v40, s4, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v41, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v42, s4, v22
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[62:63], v[58:59], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v43, s4, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v44, s4, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v45, s4, v25
+  ; GCN-NEXT:    v_mul_f32_e32 v46, s4, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v47, s4, v27
+  ; GCN-NEXT:    v_mul_f32_e32 v58, s4, v28
+  ; GCN-NEXT:    v_mul_f32_e32 v59, s4, v29
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[60:61], v[0:15]
+  ; GCN-NEXT:    v_mul_f32_e32 v60, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v61, s4, v31
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v64, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[66:67], v[0:15]
+  ; GCN-NEXT:    v_max3_f32 v32, v36, v38, v39
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v40, v41
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v42, v43
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v44, v45
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v46, v47
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v58, v59
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v60, v61
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[68:69], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $vgpr67
+  ; GCN-NEXT:    s_nop 10
+  ; GCN-NEXT:    v_mul_f32_e32 v33, s4, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v34, s4, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v35, s4, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v36, s4, v3
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v33, v34
+  ; GCN-NEXT:    v_mul_f32_e32 v37, s4, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v38, s4, v5
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v35, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v39, s4, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v40, s4, v7
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v37, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v41, s4, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v42, s4, v9
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v39, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v43, s4, v10
+  ; GCN-NEXT:    v_mul_f32_e32 v44, s4, v11
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v41, v42
+  ; GCN-NEXT:    v_mul_f32_e32 v45, s4, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v46, s4, v13
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v43, v44
+  ; GCN-NEXT:    v_mul_f32_e32 v47, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v58, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v45, v46
+  ; GCN-NEXT:    v_max3_f32 v32, v32, v47, v58
+  ; GCN-NEXT:    ds_bpermute_b32 v33, v53, v32
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    ds_read_b128 v[58:61], v66
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_max_f32_e32 v33, v33, v33
+  ; GCN-NEXT:    v_max_f32_e32 v62, v32, v33
+  ; GCN-NEXT:    ds_bpermute_b32 v63, v53, v62
+  ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v62, v63, v62, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v62, v62, v62
+  ; GCN-NEXT:    v_max_f32_e32 v63, v65, v62
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v63
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v63
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v63
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v63
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v63
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v63
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v63
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v68, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v69, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v70, v19
+  ; GCN-NEXT:    v_sub_f32_e32 v62, v64, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v62, 0x3fb8aa3b, v62
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v16
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v68
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v69
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v26, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v70
+  ; GCN-NEXT:    v_fma_f32 v74, s4, v27, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v62, v62
+  ; GCN-NEXT:    v_pack_b32_f16 v64, v17, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v65, v21, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[58:59], v[64:65], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, 0, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v71
+  ; GCN-NEXT:    v_fma_f32 v59, s4, v28, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v77, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v72
+  ; GCN-NEXT:    v_fma_f32 v79, s4, v29, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v81, v73
+  ; GCN-NEXT:    v_fma_f32 v82, s4, v30, -v63
   ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v84, v75
+  ; GCN-NEXT:    v_perm_b32 v28, v50, v48, s2
+  ; GCN-NEXT:    v_perm_b32 v29, v50, v48, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v74
+  ; GCN-NEXT:    v_fma_f32 v85, s4, v31, -v63
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v66 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v67
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    v_perm_b32 v30, v51, v49, s2
+  ; GCN-NEXT:    v_perm_b32 v31, v51, v49, s3
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v48
+  ; GCN-NEXT:    v_pack_b32_f16 v48, v58, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v49, v81, v84
+  ; GCN-NEXT:    ds_read_b128 v[24:27], v67 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v54, v28
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[60:61], v[48:49], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v50, 0x3fb8aa3b, v59
+  ; GCN-NEXT:    v_mul_f32_e32 v51, 0x3fb8aa3b, v79
+  ; GCN-NEXT:    v_mul_f32_e32 v54, 0x3fb8aa3b, v82
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v55, v29
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v56, v30
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v56, v77
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v50
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v80
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v51
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v83
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v54
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v74
+  ; GCN-NEXT:    v_mul_f32_e32 v55, 0x3fb8aa3b, v85
+  ; GCN-NEXT:    v_fma_f32 v3, s4, v3, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v79, v55
+  ; GCN-NEXT:    v_pack_b32_f16 v50, v56, v50
+  ; GCN-NEXT:    v_pack_b32_f16 v51, v51, v54
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[50:51], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v60
+  ; GCN-NEXT:    v_fma_f32 v4, s4, v4, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v61
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v5, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v78
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v79
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
+  ; GCN-NEXT:    ds_write_b32 v57, v31
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v66
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v3
+  ; GCN-NEXT:    v_pack_b32_f16 v54, v20, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v55, v1, v2
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[54:55], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v5, 0x3fb8aa3b, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v81
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v8, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v82
+  ; GCN-NEXT:    v_fma_f32 v3, s4, v9, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v84
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v10, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v85
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v11, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v56, v0, v2
+  ; GCN-NEXT:    v_pack_b32_f16 v57, v4, v5
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[28:29], v[56:57], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v86
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v12, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v87
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v13, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v88
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v14, -v63
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v89
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v15, -v63
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v66 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v67
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v5, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_pack_b32_f16 v28, v4, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v29, v10, v6
+  ; GCN-NEXT:    v_exp_f32_e32 v66, v5
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[30:31], v[28:29], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v90
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v91
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v9
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v92
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_exp_f32_e32 v95, v11
+  ; GCN-NEXT:    v_pack_b32_f16 v30, v4, v5
+  ; GCN-NEXT:    v_pack_b32_f16 v31, v6, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v66
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v95
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[30:31], v[32:47]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v94
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v93
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
+  ; GCN-NEXT:    v_add_u32_e32 v52, s0, v52
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v0, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v5, v1
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[58:59], v[32:47]
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  ; GCN-NEXT:    v_mul_f32_e64 v0, v0, v62
+  ; GCN-NEXT:    v_mul_f32_e64 v1, v1, v62
+  ; GCN-NEXT:    v_mul_f32_e64 v2, v2, v62
+  ; GCN-NEXT:    v_mul_f32_e64 v3, v3, v62
+  ; GCN-NEXT:    v_mul_f32_e64 v4, v4, v62
+  ; GCN-NEXT:    v_mul_f32_e64 v5, v5, v62
+  ; GCN-NEXT:    v_pk_mul_f32 v[6:7], v[6:7], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[8:9], v[8:9], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[10:11], v[10:11], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[12:13], v[12:13], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[62:63] op_sel_hi:[1,0]
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[64:65], v[0:15]
+  ; GCN-NEXT:    v_add_f32_e32 v16, v68, v76
+  ; GCN-NEXT:    v_add_f32_e32 v16, v69, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v70, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v71, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v72, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v73, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v75, v16
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[48:49], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[24:25], v[50:51], v[0:15]
+  ; GCN-NEXT:    v_add_f32_e32 v24, v77, v16
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v67 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[26:27], v[54:55], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[20:21], v[56:57], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[22:23], v[28:29], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[30:31], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[58:59], v[0:15]
+  ; GCN-NEXT:    v_add_f32_e32 v20, v80, v24
+  ; GCN-NEXT:    v_add_f32_e32 v20, v83, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v74, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v60, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v61, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v78, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v79, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v81, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v82, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v84, v20
+  ; GCN-NEXT:    v_add_f32_e32 v20, v85, v20
+  ; GCN-NEXT:    v_add_f32_e32 v16, v86, v20
+  ; GCN-NEXT:    v_add_f32_e32 v16, v87, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v88, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v89, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v90, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v91, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v92, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v63, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v66, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v93, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v94, v16
+  ; GCN-NEXT:    v_add_f32_e32 v16, v95, v16
+  ; GCN-NEXT:    ds_bpermute_b32 v17, v53, v16
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_add_f32_e32 v0, v16, v17
+  ; GCN-NEXT:    ds_bpermute_b32 v0, v53, v0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_mov_b32_e32 v0, 0
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir
new file mode 100644
index 0000000000000..8721c32688571
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir
@@ -0,0 +1,118 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs -amdgpu-disable-mfma-chain-order-deps %s -o - 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=CHAIN
+
+---
+name:            test_fmha_order
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $exec
+    ; CHECK-LABEL: name: test_fmha_order
+    ; CHECK: liveins: $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %addr:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: %accB:vreg_512_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: SCHED_BARRIER 0, implicit $exec
+    ; CHECK-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+    ; CHECK-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+    ; CHECK-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+    ; CHECK-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+    ; CHECK-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+    ; CHECK-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+    ; CHECK-NEXT: %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+    ; CHECK-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 1, implicit $exec
+    ; CHECK-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $exec
+    ;
+    ; CHAIN-LABEL: name: test_fmha_order
+    ; CHAIN: liveins: $exec
+    ; CHAIN-NEXT: {{  $}}
+    ; CHAIN-NEXT: %addr:vgpr_32 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %accB:vreg_512_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF
+    ; CHAIN-NEXT: SCHED_BARRIER 0, implicit $exec
+    ; CHAIN-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+    ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+    ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+    ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+    ; CHAIN-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+    ; CHAIN-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+    ; CHAIN-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+    ; CHAIN-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+    ; CHAIN-NEXT: SCHED_BARRIER 1, implicit $exec
+    ; CHAIN-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHAIN-NEXT: S_ENDPGM 0, implicit $exec
+    %addr:vgpr_32 = IMPLICIT_DEF
+    %c0:vreg_128_align2 = IMPLICIT_DEF
+    %c1:vreg_128_align2 = IMPLICIT_DEF
+    %accA:vreg_512_align2 = IMPLICIT_DEF
+    %accB:vreg_512_align2 = IMPLICIT_DEF
+    %accC:vreg_512_align2 = IMPLICIT_DEF
+    %accD:vreg_512_align2 = IMPLICIT_DEF
+
+    SCHED_BARRIER 0, implicit $exec
+
+    %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+    %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+    %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+    %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+    %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+    %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+    %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+    %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+
+    %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+
+    %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+
+    %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+    %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+
+    %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+    %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+
+    %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+    %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+
+    %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+    SCHED_BARRIER 1, implicit $exec
+    S_ENDPGM 0, implicit $exec
+
+...
+
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHAIN: {{.*}}
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index e29be2b744874..77dc5a3acbd2f 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,74 +369,71 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_mov_b32 s5, s4
-; CHECK-NEXT:    v_mov_b64_e32 v[26:27], s[4:5]
+; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[16:19]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b64_e32 v[4:5], 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
-; CHECK-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
-; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v24, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mov_b32_e32 v8, 0x7fc00000
-; CHECK-NEXT:    v_mov_b32_e32 v9, v8
-; CHECK-NEXT:    v_mov_b32_e32 v10, v8
-; CHECK-NEXT:    v_mov_b32_e32 v11, v8
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; CHECK-NEXT:    v_mov_b64_e32 v[0:1], 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT:    global_store_short v[0:1], v2, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11]
+; CHECK-NEXT:    v_mov_b64_e32 v[22:23], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[20:21], v[0:3]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[22:23], v[0:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    v_mov_b32_e32 v12, 0x7fc00000
+; CHECK-NEXT:    v_mov_b32_e32 v13, v12
+; CHECK-NEXT:    v_mov_b32_e32 v14, v12
+; CHECK-NEXT:    v_mov_b32_e32 v15, v12
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v9, v10
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[6:7], v[16:19]
+; CHECK-NEXT:    global_store_short v[4:5], v9, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[12:15]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
-; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v10, v6
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
-; CHECK-NEXT:    global_store_short v[0:1], v10, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13]
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT:    s_nop 4
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v8
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v13, v10
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT:    global_store_short v[4:5], v13, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[20:21], v[0:3]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CHECK-NEXT:    global_store_short v[0:1], v6, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
+; CHECK-NEXT:    global_store_short v[4:5], v14, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[0:1], v24, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11]
+; CHECK-NEXT:    global_store_short v[4:5], v12, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[22:23], v[6:7], v[8:11]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[20:21], v[6:7], v[0:3]
 ; CHECK-NEXT:    s_nop 6
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v2
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
-; CHECK-NEXT:    global_store_short v[0:1], v6, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v8
+; CHECK-NEXT:    global_store_short v[4:5], v0, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    s_nop 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    global_store_short v[0:1], v2, off
+; CHECK-NEXT:    global_store_short v[4:5], v1, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()



More information about the llvm-commits mailing list