[llvm] [AMDGPU][IGLP] SingleWaveOpt: Cache DSW Counters from PreRA + Don't collect PERMs for generic VALU SG (PR #67759)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 3 16:39:30 PDT 2023


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/67759

>From 896b6324738728c7997c83b1351856665ead4993 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 28 Sep 2023 18:11:37 -0700
Subject: [PATCH] [AMDGPU][IGLP] SingleWaveOpt: Cache DSW counters from PreRA

Change-Id: I9e03ccd39c44aa3f2cb2ca9b1c9e1adadfdbf3a1
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     | 105 +++++++-------
 .../AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir | 132 +++++++++---------
 .../AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir |  74 +++++-----
 3 files changed, 160 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index f60236080351a2c..d8b5d876b7016b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -900,6 +900,11 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+static unsigned DSWCount = 0;
+static unsigned DSWWithPermCount = 0;
+static unsigned DSWWithSharedVMEMCount = 0;
+static bool HasDSWCounts = false;
+
 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 private:
   // Whether the DS_READ is a predecessor of first four MFMA in region
@@ -1076,9 +1081,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
               Cache->push_back(Pred.getSUnit());
           }
         }
+
+        // If the other group has no PERM preds, then this group won't share any
+        if (!Cache->size())
+          return false;
       }
 
-      assert(Cache->size());
       auto DAG = SyncPipe[0].DAG;
       // Does the previous DS_WRITE share a V_PERM predecessor with this
       // VMEM_READ
@@ -1109,9 +1117,6 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
   unsigned MFMACount = 0;
-  unsigned DSWCount = 0;
-  unsigned DSWWithPermCount = 0;
-  unsigned DSWWithSharedVMEMCount = 0;
   unsigned DSRCount = 0;
   SmallVector<SUnit *, 6> DSWithPerms;
   for (auto &SU : DAG->SUnits) {
@@ -1121,7 +1126,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
     else if (TII->isDS(*I)) {
       if (I->mayLoad())
         ++DSRCount;
-      else if (I->mayStore()) {
+      else if (I->mayStore() && !HasDSWCounts) {
         ++DSWCount;
         for (auto Pred : SU.Preds) {
           if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1133,58 +1138,62 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
       }
     }
   }
-  DSWWithPermCount = DSWithPerms.size();
-  auto I = DSWithPerms.begin();
-  auto E = DSWithPerms.end();
-
-  // Get the count of DS_WRITES with V_PERM predecessors which
-  // have loop carried dependencies (WAR) on the same VMEM_READs.
-  // We consider partial overlap as a miss -- in other words,
-  // for a given DS_W, we only consider another DS_W as matching
-  // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
-  // for every V_PERM pred of this DS_W.
-  DenseMap<MachineInstr *, SUnit *> VMEMLookup;
-  SmallVector<SUnit *, 6> Counted;
-  for (; I != E; I++) {
-    SUnit *Cand = nullptr;
-    bool MissedAny = false;
-    for (auto &Pred : (*I)->Preds) {
-      if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
-        continue;
 
-      if (Cand && llvm::is_contained(Counted, Cand))
-        break;
-
-      for (auto &Succ : Pred.getSUnit()->Succs) {
-        auto MI = Succ.getSUnit()->getInstr();
-        if (!TII->isVMEM(*MI) || !MI->mayLoad())
+  if (!HasDSWCounts) {
+    DSWWithPermCount = DSWithPerms.size();
+    auto I = DSWithPerms.begin();
+    auto E = DSWithPerms.end();
+
+    // Get the count of DS_WRITES with V_PERM predecessors which
+    // have loop carried dependencies (WAR) on the same VMEM_READs.
+    // We consider partial overlap as a miss -- in other words,
+    // for a given DS_W, we only consider another DS_W as matching
+    // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
+    // for every V_PERM pred of this DS_W.
+    DenseMap<MachineInstr *, SUnit *> VMEMLookup;
+    SmallVector<SUnit *, 6> Counted;
+    for (; I != E; I++) {
+      SUnit *Cand = nullptr;
+      bool MissedAny = false;
+      for (auto &Pred : (*I)->Preds) {
+        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
           continue;
 
-        if (MissedAny || !VMEMLookup.size()) {
-          MissedAny = true;
-          VMEMLookup[MI] = *I;
-          continue;
-        }
+        if (Cand && llvm::is_contained(Counted, Cand))
+          break;
 
-        if (!VMEMLookup.contains(MI)) {
-          MissedAny = true;
-          VMEMLookup[MI] = *I;
-          continue;
-        }
+        for (auto &Succ : Pred.getSUnit()->Succs) {
+          auto MI = Succ.getSUnit()->getInstr();
+          if (!TII->isVMEM(*MI) || !MI->mayLoad())
+            continue;
 
-        Cand = VMEMLookup[MI];
-        if (llvm::is_contained(Counted, Cand)) {
-          MissedAny = true;
-          break;
+          if (MissedAny || !VMEMLookup.size()) {
+            MissedAny = true;
+            VMEMLookup[MI] = *I;
+            continue;
+          }
+
+          if (!VMEMLookup.contains(MI)) {
+            MissedAny = true;
+            VMEMLookup[MI] = *I;
+            continue;
+          }
+
+          Cand = VMEMLookup[MI];
+          if (llvm::is_contained(Counted, Cand)) {
+            MissedAny = true;
+            break;
+          }
         }
       }
-    }
-    if (!MissedAny && Cand) {
-      DSWWithSharedVMEMCount += 2;
-      Counted.push_back(Cand);
-      Counted.push_back(*I);
+      if (!MissedAny && Cand) {
+        DSWWithSharedVMEMCount += 2;
+        Counted.push_back(Cand);
+        Counted.push_back(*I);
+      }
     }
   }
+  HasDSWCounts = true;
 
   assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
   SchedGroup *SG;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
index 715a88351bb92fd..f9ee80e5bdb53e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@@ -77,103 +77,103 @@ body:             |
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF8]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF9]], implicit $exec
-  ; GCN-NEXT:   dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF10]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF11]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_6:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF12]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF13]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_7:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF14]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF15]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_6]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_6]].sub0_sub1, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_8:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF16]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF17]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_6]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_6]].sub2_sub3, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_9:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF18]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF19]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_7]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_7]].sub0_sub1, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_10:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF20]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF21]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_7]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_7]].sub2_sub3, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_11:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_8]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   undef %63.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF28]], [[DEF29]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %63.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF30]], [[DEF31]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %63.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF24]], [[DEF25]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %63.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF26]], [[DEF27]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF40]], %63, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_1]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_2]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_3]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_8]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_4]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_5]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_6]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_7]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_9]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   undef %64.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF36]], [[DEF37]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %64.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF38]], [[DEF39]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %64.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF32]], [[DEF33]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   %64.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF34]], [[DEF35]], [[DEF44]], implicit $exec
-  ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF40]], %64, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_8]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_9]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_8]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   undef [[V_PERM_B32_e64_:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF28]], [[DEF29]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF30]], [[DEF31]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF24]], [[DEF25]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF26]], [[DEF27]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF40]], [[V_PERM_B32_e64_]], 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3)
+  ; GCN-NEXT:   [[DEF31:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF30:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_1]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF29:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_2]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF28:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_3]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_8]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_4]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_5]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_6]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_7]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_9]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   undef [[V_PERM_B32_e64_1:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF36]], [[DEF37]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_1:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF38]], [[DEF39]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_1:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF32]], [[DEF33]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   [[V_PERM_B32_e64_1:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF34]], [[DEF35]], [[DEF44]], implicit $exec
+  ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF40]], [[V_PERM_B32_e64_1]], 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3)
+  ; GCN-NEXT:   [[DEF32:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_8]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF33:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_9]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF34:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF35:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF36:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF37:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7)
   ; GCN-NEXT:   [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_USHORT_OFFEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF38:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF39:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_ADD_U32_e32_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_18]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_19]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_11]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[DEF45]], implicit $exec
+  ; GCN-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF23]], implicit $exec
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_11]].sub0_sub1, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF41]], [[DEF4]], 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF46]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_ADD_U32_e32_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_20]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_21]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_11]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF4:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF46]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[DEF46]], implicit $exec
+  ; GCN-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF22]], implicit $exec
+  ; GCN-NEXT:   [[DEF1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_11]].sub2_sub3, [[DEF1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF41]], [[DEF5]], 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_17]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF5:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_17]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7)
   ; GCN-NEXT:   IGLP_OPT 1
-  ; GCN-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc
-  ; GCN-NEXT:   [[V_ADD_U32_e32_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_22]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_23]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_24]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_25]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_26]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_27]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_28]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_29]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_30]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_31]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_32]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_33]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_34]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_35]], implicit $exec
+  ; GCN-NEXT:   [[DEF49:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[DEF49]], -1, implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LG_U32 [[DEF49]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF9]], implicit $exec
+  ; GCN-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF8]], implicit $exec
+  ; GCN-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF13]], implicit $exec
+  ; GCN-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF12]], implicit $exec
+  ; GCN-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF11]], implicit $exec
+  ; GCN-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF10]], implicit $exec
+  ; GCN-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF15]], implicit $exec
+  ; GCN-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF14]], implicit $exec
+  ; GCN-NEXT:   [[DEF17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF17]], implicit $exec
+  ; GCN-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF16]], implicit $exec
+  ; GCN-NEXT:   [[DEF19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF19]], implicit $exec
+  ; GCN-NEXT:   [[DEF18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF18]], implicit $exec
+  ; GCN-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF21]], implicit $exec
+  ; GCN-NEXT:   [[DEF20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[DEF20]], implicit $exec
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir
index 1d7f471f8e9bdb1..6fed102ed190861 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir
@@ -62,57 +62,57 @@ body:      |
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]]
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF21]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
   ; GCN-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF22]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF23]], implicit $exec
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF]]
-  ; GCN-NEXT:   undef [[DEF17]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec
-  ; GCN-NEXT:   [[DEF17]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec
-  ; GCN-NEXT:   [[DEF17]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec
-  ; GCN-NEXT:   [[DEF17]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec
+  ; GCN-NEXT:   [[DEF16:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   dead [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]]
+  ; GCN-NEXT:   undef [[DEF17:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec
+  ; GCN-NEXT:   [[DEF17:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec
+  ; GCN-NEXT:   [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec
+  ; GCN-NEXT:   [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF5]], [[DEF17]], 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_4]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   undef [[DEF18]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec
-  ; GCN-NEXT:   [[DEF18]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec
-  ; GCN-NEXT:   [[DEF18]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF31]], implicit $exec
-  ; GCN-NEXT:   [[DEF18]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF31]], implicit $exec
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_4]].sub0_sub1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   undef [[DEF18:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec
+  ; GCN-NEXT:   [[DEF18:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec
+  ; GCN-NEXT:   [[DEF18:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF31]], implicit $exec
+  ; GCN-NEXT:   [[DEF18:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF31]], implicit $exec
   ; GCN-NEXT:   DS_WRITE_B128_gfx9 [[DEF5]], [[DEF18]], 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_4]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_1]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_2]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_3]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_4]].sub2_sub3, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_1]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_2]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_3]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF24]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF25]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec
   ; GCN-NEXT:   [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7)
-  ; GCN-NEXT:   dead [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   [[DEF15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7)
+  ; GCN-NEXT:   dead [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   IGLP_OPT 1
-  ; GCN-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc
-  ; GCN-NEXT:   [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_8]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_9]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_10]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_11]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 64, [[V_ADD_U32_e32_12]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_13]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_14]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_15]], implicit $exec
-  ; GCN-NEXT:   [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_16]], implicit $exec
+  ; GCN-NEXT:   [[DEF29:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[DEF29]], -1, implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LG_U32 [[DEF29]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[DEF21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF21]], implicit $exec
+  ; GCN-NEXT:   [[DEF33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF33]], implicit $exec
+  ; GCN-NEXT:   [[DEF23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF23]], implicit $exec
+  ; GCN-NEXT:   [[DEF22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF22]], implicit $exec
+  ; GCN-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 64, [[DEF6]], implicit $exec
+  ; GCN-NEXT:   [[DEF27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF27]], implicit $exec
+  ; GCN-NEXT:   [[DEF26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF26]], implicit $exec
+  ; GCN-NEXT:   [[DEF25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF25]], implicit $exec
+  ; GCN-NEXT:   [[DEF24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[DEF24]], implicit $exec
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:



More information about the llvm-commits mailing list