[clang] b0f4678 - [AMDGPU] Add iglp_opt builtin and MFMA GEMM Opt strategy

Austin Kerbow via cfe-commits cfe-commits at lists.llvm.org
Fri Aug 19 15:50:33 PDT 2022


Author: Austin Kerbow
Date: 2022-08-19T15:38:36-07:00
New Revision: b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb

URL: https://github.com/llvm/llvm-project/commit/b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb
DIFF: https://github.com/llvm/llvm-project/commit/b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb.diff

LOG: [AMDGPU] Add iglp_opt builtin and MFMA GEMM Opt strategy

Adds a builtin that serves as an optimization hint to apply specific optimized
DAG mutations during scheduling. This also disables any other mutations or
clustering that may interfere with the desired pipeline. The first optimization
strategy that is added here is designed to improve the performance of small gemm
kernels on gfx90a.

Reviewed By: jrbyrnes

Differential Revision: https://reviews.llvm.org/D132079

Added: 
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/test/CodeGenOpenCL/builtins-amdgcn.cl
    clang/test/SemaOpenCL/builtins-amdgcn-error.cl
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
    llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
    llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
    llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir

Removed: 
    llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll
    llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir


################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 618d5562e5093..b6e60c26082da 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -64,6 +64,7 @@ BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
 BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
 BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n")

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 444b65a83719b..9696f3536e2f6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -422,6 +422,19 @@ void test_sched_group_barrier()
   __builtin_amdgcn_sched_group_barrier(15, 10000, -1);
 }
 
+// CHECK-LABEL: @test_iglp_opt
+// CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
+// CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
+// CHECK: call void @llvm.amdgcn.iglp.opt(i32 4)
+// CHECK: call void @llvm.amdgcn.iglp.opt(i32 15)
+void test_iglp_opt()
+{
+  __builtin_amdgcn_iglp_opt(0);
+  __builtin_amdgcn_iglp_opt(1);
+  __builtin_amdgcn_iglp_opt(4);
+  __builtin_amdgcn_iglp_opt(15);
+}
+
 // CHECK-LABEL: @test_s_sleep
 // CHECK: call void @llvm.amdgcn.s.sleep(i32 1)
 // CHECK: call void @llvm.amdgcn.s.sleep(i32 15)

diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
index dd296e3854973..b044763edcf00 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl
@@ -72,6 +72,11 @@ void test_sched_group_barrier(int x)
   __builtin_amdgcn_sched_group_barrier(0, 1, x); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}}
 }
 
+void test_iglp_opt(int x)
+{
+  __builtin_amdgcn_iglp_opt(x); // expected-error {{argument to '__builtin_amdgcn_iglp_opt' must be a constant integer}}
+}
+
 void test_sicmp_i32(global ulong* out, int a, int b, uint c)
 {
   *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}}

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ced9338c08477..189780d6407bd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -254,6 +254,12 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_
   [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
    IntrConvergent, IntrWillReturn]>;
 
+// Scheduler optimization hint.
+//     MASK = 0: Small gemm opt
+def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn]>;
+
 def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 360fc65e63a74..3128c1a9a3f42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -31,12 +31,6 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool>
-    EnableIGroupLP("amdgpu-igrouplp",
-                   cl::desc("Enable construction of Instruction Groups and "
-                            "their ordering for scheduling"),
-                   cl::init(false));
-
 static cl::opt<bool> EnableExactSolver(
     "amdgpu-igrouplp-exact-solver", cl::Hidden,
     cl::desc("Whether to use the exponential time solver to fit "
@@ -106,7 +100,10 @@ class SchedGroup {
   int SyncID = 0;
 
   // SGID is used to map instructions to candidate SchedGroups
-  int SGID;
+  unsigned SGID;
+
+  // Count of the number of created SchedGroups, used to initialize SGID.
+  static unsigned NumSchedGroups;
 
   ScheduleDAGInstrs *DAG;
 
@@ -180,18 +177,22 @@ class SchedGroup {
 
   SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {}
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
+    SGID = NumSchedGroups++;
+  }
 
   SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
-             int SGID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), SGID(SGID), DAG(DAG),
-        TII(TII) {}
+             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
+    SGID = NumSchedGroups++;
+  }
 };
 
 // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
   assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
 
   while (!SU.Preds.empty())
     for (auto &P : SU.Preds)
@@ -725,31 +726,107 @@ void PipelineSolver::solve() {
   makePipeline();
 }
 
-class IGroupLPDAGMutation : public ScheduleDAGMutation {
-private:
-  // Organize lists of SchedGroups by their SyncID. SchedGroups /
-  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
-  // between then.
-  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
+enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
 
-  // The number of created sched groups -- also used as SGID
-  int NumCreatedSchedGroups = 0;
+// Implement a IGLP scheduling strategy.
+class IGLPStrategy {
+protected:
+  ScheduleDAGInstrs *DAG;
 
-  // Used to track instructions that can be mapped to multiple sched groups
-  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
+  const SIInstrInfo *TII;
 
 public:
-  const SIInstrInfo *TII;
-  ScheduleDAGMI *DAG;
+  // Add SchedGroups to \p Pipeline to implement this Strategy.
+  virtual void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
 
-  IGroupLPDAGMutation() = default;
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+  // Returns true if this strategy should be applied to a ScheduleDAG.
+  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
+
+  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : DAG(DAG), TII(TII) {}
+
+  virtual ~IGLPStrategy() = default;
 };
 
-// DAG mutation that coordinates with the SCHED_BARRIER instruction and
-// corresponding builtin. The mutation adds edges from specific instruction
-// classes determined by the SCHED_BARRIER mask so that they cannot be
-class SchedBarrierDAGMutation : public ScheduleDAGMutation {
+class MFMASmallGemmOpt final : public IGLPStrategy {
+public:
+  void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {}
+};
+
+void MFMASmallGemmOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
+  // Count the number of MFMA instructions.
+  unsigned MFMACount = 0;
+  for (auto I = DAG->begin(), E = DAG->end(); I != E; ++I) {
+    if (TII->isMFMA(*I))
+      ++MFMACount;
+  }
+
+  const unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+  for (unsigned I = 0; I < MFMACount; ++I) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  for (unsigned I = 0; I < MFMACount; ++I) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+}
+
+static std::unique_ptr<IGLPStrategy>
+createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
+                   const SIInstrInfo *TII) {
+  switch (ID) {
+  case MFMASmallGemmOptID:
+    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+  }
+
+  llvm_unreachable("Unknown IGLPStrategyID");
+}
+
+class IGroupLPDAGMutation : public ScheduleDAGMutation {
 private:
   const SIInstrInfo *TII;
 
@@ -760,9 +837,6 @@ class SchedBarrierDAGMutation : public ScheduleDAGMutation {
   // between then.
   DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
 
-  // The number of create sched groups -- also used as SGID
-  int NumCreatedSchedGroups = 0;
-
   // Used to track instructions that can be mapped to multiple sched groups
   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
 
@@ -784,12 +858,16 @@ class SchedBarrierDAGMutation : public ScheduleDAGMutation {
   void initSchedGroupBarrierPipelineStage(
       std::vector<SUnit>::reverse_iterator RIter);
 
+  void initIGLPOpt(SUnit &SU);
+
 public:
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
-  SchedBarrierDAGMutation() = default;
+  IGroupLPDAGMutation() = default;
 };
 
+unsigned SchedGroup::NumSchedGroups = 0;
+
 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
   if (A != B && DAG->canAddEdge(B, A)) {
     DAG->addEdge(B, SDep(A, SDep::Artificial));
@@ -960,88 +1038,44 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
 }
 
 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
-  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
-
-  // IGroupLP and sched_group_barrier are mutually exclusive mutations.
-  // Check for sched_group_barriers as that mutation gets priority.
-  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
-    if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) {
-      return;
-    }
-  }
-
-  SyncedSchedGroups.clear();
-  SyncedInstrs.clear();
-  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
-  if (!TSchedModel || DAG->SUnits.empty())
-    return;
-
-  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
-
-  // The order of InstructionGroups in this vector defines the
-  // order in which edges will be added. In other words, given the
-  // present ordering, we will try to make each VMEMRead instruction
-  // a predecessor of each DSRead instruction, and so on.
-
-  struct SGParams {
-    SchedGroupMask Mask;
-    Optional<unsigned> Size;
-    int SyncID;
-
-    SGParams(SchedGroupMask Mask, Optional<unsigned> Size, int SyncID)
-        : Mask(Mask), Size(Size), SyncID(SyncID) {}
-  };
-
-  SmallVector<SGParams, 16> PipelineOrderGroups;
-
-  for (size_t i = 0; i < DAG->SUnits.size() / 4; i++) {
-    PipelineOrderGroups.push_back({SchedGroupMask::DS_READ, 8, 0});
-    PipelineOrderGroups.push_back({SchedGroupMask::MFMA, 1, 0});
-    PipelineOrderGroups.push_back({SchedGroupMask::DS_WRITE, 8, 0});
-  }
-
-  auto I = PipelineOrderGroups.rbegin();
-  auto E = PipelineOrderGroups.rend();
-  for (; I < E; I++) {
-    auto &SG = SyncedSchedGroups[I->SyncID].emplace_back(
-        I->Mask, I->Size, I->SyncID, NumCreatedSchedGroups++, DAG, TII);
-    SG.initSchedGroup(SyncedInstrs[SG.getSyncID()]);
-  }
-
-  PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
-  // PipelineSolver performs the mutation by adding the edges it
-  // determined as the best
-  PS.solve();
-}
-
-void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
   const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
   if (!TSchedModel || DAGInstrs->SUnits.empty())
     return;
 
-  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");
+  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
   const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
   SyncedSchedGroups.clear();
   SyncedInstrs.clear();
+  bool foundSB = false;
+  bool foundIGLP = false;
   for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
-    if (R->getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
+    unsigned Opc = R->getInstr()->getOpcode();
+    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
+    if (Opc == AMDGPU::SCHED_BARRIER) {
       addSchedBarrierEdges(*R);
-
-    else if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+      foundSB = true;
+    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
       initSchedGroupBarrierPipelineStage(R);
+      foundSB = true;
+    } else if (Opc == AMDGPU::IGLP_OPT) {
+      resetEdges(*R, DAG);
+      if (!foundSB && !foundIGLP)
+        initIGLPOpt(*R);
+      foundIGLP = true;
+    }
   }
 
-  PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
-  // PipelineSolver performs the mutation by adding the edges it
-  // determined as the best
-  PS.solve();
+  if (foundSB || foundIGLP) {
+    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
+    // PipelineSolver performs the mutation by adding the edges it
+    // determined as the best
+    PS.solve();
+  }
 }
 
-void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
+void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
   MachineInstr &MI = *SchedBarrier.getInstr();
   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
   // Remove all existing edges from the SCHED_BARRIER that were added due to the
@@ -1059,7 +1093,7 @@ void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
 }
 
 SchedGroupMask
-SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
+IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // Invert mask and erase bits for types of instructions that are implied to be
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
@@ -1093,7 +1127,7 @@ SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   return InvertedMask;
 }
 
-void SchedBarrierDAGMutation::initSchedGroupBarrierPipelineStage(
+void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
     std::vector<SUnit>::reverse_iterator RIter) {
   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
   // to the instruction having side effects.
@@ -1104,22 +1138,26 @@ void SchedBarrierDAGMutation::initSchedGroupBarrierPipelineStage(
   int32_t Size = SGB.getOperand(1).getImm();
   int32_t SyncID = SGB.getOperand(2).getImm();
 
-  auto &SG = SyncedSchedGroups[SyncID].emplace_back(
-      (SchedGroupMask)SGMask, Size, SyncID, NumCreatedSchedGroups++, DAG, TII);
+  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
+                                                    Size, SyncID, DAG, TII);
 
   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
 }
 
+void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
+  IGLPStrategyID StrategyID =
+      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
+  auto S = createIGLPStrategy(StrategyID, DAG, TII);
+  if (S->shouldApplyStrategy(DAG))
+    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
+}
+
 } // namespace
 
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
-  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
-}
-
-std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
-  return std::make_unique<SchedBarrierDAGMutation>();
+  return std::make_unique<IGroupLPDAGMutation>();
 }
 
 } // end namespace llvm

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aeb1bbad37057..ae0faba0780d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -15,7 +15,6 @@
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
-std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation();
 
 } // namespace llvm
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index db2cfe6d8b7d1..45fbc84f6f3f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -234,6 +234,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
       return;
     }
 
+    if (MI->getOpcode() == AMDGPU::IGLP_OPT) {
+      if (isVerbose()) {
+        std::string HexString;
+        raw_string_ostream HexStream(HexString);
+        HexStream << format_hex(MI->getOperand(0).getImm(), 10, true);
+        OutStreamer->emitRawComment(" iglp_opt mask(" + HexString + ")");
+      }
+      return;
+    }
+
     if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) {
       if (isVerbose())
         OutStreamer->emitRawComment(" divergent unreachable");

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 103a72b1aaa3f..5680c3e72e240 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -425,7 +425,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createIGroupLPDAGMutation());
-  DAG->addMutation(createSchedBarrierDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -436,7 +435,6 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
   DAG->addMutation(createIGroupLPDAGMutation());
-  DAG->addMutation(createSchedBarrierDAGMutation());
   return DAG;
 }
 
@@ -939,14 +937,15 @@ class GCNPassConfig final : public AMDGPUPassConfig {
 
   ScheduleDAGInstrs *
   createPostMachineScheduler(MachineSchedContext *C) const override {
-    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
+        C, std::make_unique<PostGenericScheduler>(C),
+        /*RemoveKillFlags=*/true);
     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     if (ST.shouldClusterStores())
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
     DAG->addMutation(createIGroupLPDAGMutation());
-    DAG->addMutation(createSchedBarrierDAGMutation());
     if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f8f1d029624d7..6bf4ade399b77 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -24,6 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "GCNSchedStrategy.h"
+#include "AMDGPUIGroupLP.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 
@@ -31,7 +32,7 @@
 
 using namespace llvm;
 
-cl::opt<bool>
+static cl::opt<bool>
     DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
                            cl::Hidden,
                            cl::desc("Disable unclustred high register pressure "
@@ -570,10 +571,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   RegionsWithHighRP.resize(Regions.size());
   RegionsWithExcessRP.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
+  RegionsWithIGLPInstrs.resize(Regions.size());
   RescheduleRegions.set();
   RegionsWithHighRP.reset();
   RegionsWithExcessRP.reset();
   RegionsWithMinOcc.reset();
+  RegionsWithIGLPInstrs.reset();
 
   runSchedStages();
 }
@@ -655,6 +658,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
+  DAG.addMutation(createIGroupLPDAGMutation());
+
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
   // stage. Temporarily increase occupancy target in the region.
@@ -760,8 +765,18 @@ bool GCNSchedStage::initGCNRegion() {
   // Save original instruction order before scheduling for possible revert.
   Unsched.clear();
   Unsched.reserve(DAG.NumRegionInstrs);
-  for (auto &I : DAG)
-    Unsched.push_back(&I);
+  if (StageID == GCNSchedStageID::OccInitialSchedule ||
+      StageID == GCNSchedStageID::ILPInitialSchedule) {
+    for (auto &I : DAG) {
+      Unsched.push_back(&I);
+      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+          I.getOpcode() == AMDGPU::IGLP_OPT)
+        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
+    }
+  } else {
+    for (auto &I : DAG)
+      Unsched.push_back(&I);
+  }
 
   PressureBefore = DAG.Pressure[RegionIdx];
 
@@ -774,6 +789,13 @@ bool GCNSchedStage::initGCNRegion() {
 
   S.HasHighPressure = false;
 
+  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
+      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
+    SavedMutations.clear();
+    SavedMutations.swap(DAG.Mutations);
+    DAG.addMutation(createIGroupLPDAGMutation());
+  }
+
   return true;
 }
 
@@ -829,6 +851,10 @@ void GCNSchedStage::finalizeGCNRegion() {
   // reason that the original schedule is better.
   checkScheduling();
 
+  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
+      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
+    SavedMutations.swap(DAG.Mutations);
+
   DAG.exitRegion();
   RegionIdx++;
 }
@@ -1316,3 +1342,34 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
     }
   }
 }
+
+static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
+  return std::any_of(
+      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
+        unsigned Opc = MI->getOpcode();
+        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
+      });
+}
+
+GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
+    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
+    bool RemoveKillFlags)
+    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
+
+void GCNPostScheduleDAGMILive::schedule() {
+  HasIGLPInstrs = hasIGLPInstrs(this);
+  if (HasIGLPInstrs) {
+    SavedMutations.clear();
+    SavedMutations.swap(Mutations);
+    addMutation(createIGroupLPDAGMutation());
+  }
+
+  ScheduleDAGMI::schedule();
+}
+
+void GCNPostScheduleDAGMILive::finalizeSchedule() {
+  if (HasIGLPInstrs)
+    SavedMutations.swap(Mutations);
+
+  ScheduleDAGMI::finalizeSchedule();
+}

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94d14312d4197..2249138c7075a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -162,6 +162,9 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Regions that has the same occupancy as the latest MinOccupancy
   BitVector RegionsWithMinOcc;
 
+  // Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT).
+  BitVector RegionsWithIGLPInstrs;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
 
@@ -231,6 +234,8 @@ class GCNSchedStage {
   // RP after scheduling the current region.
   GCNRegPressure PressureAfter;
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
   GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
 
 public:
@@ -278,8 +283,6 @@ class OccInitialScheduleStage : public GCNSchedStage {
 
 class UnclusteredHighRPStage : public GCNSchedStage {
 private:
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
-
   // Save the initial occupancy before starting this stage.
   unsigned InitialOccupancy;
 
@@ -355,6 +358,22 @@ class ILPInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
+private:
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
+  bool HasIGLPInstrs = false;
+
+public:
+  void schedule() override;
+
+  void finalizeSchedule() override;
+
+  GCNPostScheduleDAGMILive(MachineSchedContext *C,
+                           std::unique_ptr<MachineSchedStrategy> S,
+                           bool RemoveKillFlags);
+};
+
 } // End namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1ca22284b183c..af9e0a796bbcb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -344,6 +344,19 @@ def SCHED_GROUP_BARRIER : SPseudoInstSI<
   let isMeta = 1;
 }
 
+def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
+  [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+  let isMeta = 1;
+}
+
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 

diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 13a6a718f4f2f..afa857c419f43 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -131,6 +131,17 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
 
   bool Changed = false;
   for (MachineBasicBlock &MBB : MF) {
+    bool HasIGLPInstrs =
+        std::any_of(MBB.instr_begin(), MBB.instr_end(), [](MachineInstr &MI) {
+          unsigned Opc = MI.getOpcode();
+          return (Opc == AMDGPU::SCHED_GROUP_BARRIER ||
+                  Opc == AMDGPU::IGLP_OPT);
+        });
+
+    // Don't cluster with IGLP instructions.
+    if (HasIGLPInstrs)
+      continue;
+
     MachineBasicBlock::instr_iterator Next;
     MachineBasicBlock::instr_iterator B = MBB.instr_begin();
     MachineBasicBlock::instr_iterator E = MBB.instr_end();

diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll
deleted file mode 100644
index 743f03179f3f0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.ll
+++ /dev/null
@@ -1,277 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-igrouplp=1 < %s | FileCheck -check-prefix=GREEDY %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-igrouplp-exact-solver -amdgpu-igrouplp=1 < %s | FileCheck -check-prefix=EXACT %s
-
-define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 {
-; GREEDY-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
-; GREEDY:       ; %bb.0: ; %entry
-; GREEDY-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GREEDY-NEXT:    v_lshlrev_b32_e32 v33, 7, v0
-; GREEDY-NEXT:    v_mov_b32_e32 v34, 1.0
-; GREEDY-NEXT:    v_mov_b32_e32 v35, 2.0
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_add_u32_e32 v32, s0, v33
-; GREEDY-NEXT:    ds_read_b128 v[28:31], v32 offset:112
-; GREEDY-NEXT:    ds_read_b128 v[24:27], v32 offset:96
-; GREEDY-NEXT:    ds_read_b128 v[20:23], v32 offset:80
-; GREEDY-NEXT:    ds_read_b128 v[16:19], v32 offset:64
-; GREEDY-NEXT:    ds_read_b128 v[0:3], v32
-; GREEDY-NEXT:    ds_read_b128 v[4:7], v32 offset:16
-; GREEDY-NEXT:    ds_read_b128 v[8:11], v32 offset:32
-; GREEDY-NEXT:    ds_read_b128 v[12:15], v32 offset:48
-; GREEDY-NEXT:    v_add_u32_e32 v33, s1, v33
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 2
-; GREEDY-NEXT:    ds_write_b128 v33, v[28:31] offset:112
-; GREEDY-NEXT:    ds_write_b128 v33, v[24:27] offset:96
-; GREEDY-NEXT:    ds_write_b128 v33, v[20:23] offset:80
-; GREEDY-NEXT:    ds_write_b128 v33, v[16:19] offset:64
-; GREEDY-NEXT:    ds_write_b128 v33, v[12:15] offset:48
-; GREEDY-NEXT:    ds_write_b128 v33, v[8:11] offset:32
-; GREEDY-NEXT:    ds_write_b128 v33, v[4:7] offset:16
-; GREEDY-NEXT:    ds_write_b128 v33, v[0:3]
-; GREEDY-NEXT:    ds_read_b128 v[64:67], v32 offset:8304
-; GREEDY-NEXT:    ds_read_b128 v[60:63], v32 offset:8288
-; GREEDY-NEXT:    ds_read_b128 v[56:59], v32 offset:8272
-; GREEDY-NEXT:    ds_read_b128 v[52:55], v32 offset:8256
-; GREEDY-NEXT:    ds_read_b128 v[48:51], v32 offset:8240
-; GREEDY-NEXT:    ds_read_b128 v[44:47], v32 offset:8224
-; GREEDY-NEXT:    ds_read_b128 v[40:43], v32 offset:8208
-; GREEDY-NEXT:    ds_read_b128 v[36:39], v32 offset:8192
-; GREEDY-NEXT:    v_mov_b32_e32 v0, s1
-; GREEDY-NEXT:    v_add_u32_e32 v1, 0x6000, v32
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 2
-; GREEDY-NEXT:    ds_write_b128 v0, v[60:63] offset:8288
-; GREEDY-NEXT:    ds_write_b128 v0, v[64:67] offset:8304
-; GREEDY-NEXT:    ds_write_b128 v0, v[52:55] offset:8256
-; GREEDY-NEXT:    ds_write_b128 v0, v[56:59] offset:8272
-; GREEDY-NEXT:    ds_write_b128 v0, v[44:47] offset:8224
-; GREEDY-NEXT:    ds_write_b128 v0, v[48:51] offset:8240
-; GREEDY-NEXT:    ds_write_b128 v0, v[36:39] offset:8192
-; GREEDY-NEXT:    ds_write_b128 v0, v[40:43] offset:8208
-; GREEDY-NEXT:    ds_read_b128 v[64:67], v32 offset:24688
-; GREEDY-NEXT:    ds_read_b128 v[60:63], v32 offset:24672
-; GREEDY-NEXT:    ds_read_b128 v[56:59], v32 offset:24656
-; GREEDY-NEXT:    ds_read_b128 v[52:55], v32 offset:24640
-; GREEDY-NEXT:    ds_read_b128 v[48:51], v32 offset:24624
-; GREEDY-NEXT:    ds_read_b128 v[44:47], v32 offset:24608
-; GREEDY-NEXT:    ds_read_b128 v[40:43], v32 offset:24592
-; GREEDY-NEXT:    ds_read_b128 v[36:39], v32 offset:24576
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 2
-; GREEDY-NEXT:    ds_write_b128 v0, v[60:63] offset:16480
-; GREEDY-NEXT:    ds_write_b128 v0, v[64:67] offset:16496
-; GREEDY-NEXT:    ds_write_b128 v0, v[52:55] offset:16448
-; GREEDY-NEXT:    ds_write_b128 v0, v[56:59] offset:16464
-; GREEDY-NEXT:    ds_write_b128 v0, v[44:47] offset:16416
-; GREEDY-NEXT:    ds_write_b128 v0, v[48:51] offset:16432
-; GREEDY-NEXT:    ds_write_b128 v0, v[36:39] offset:16384
-; GREEDY-NEXT:    ds_write_b128 v0, v[40:43] offset:16400
-; GREEDY-NEXT:    ds_read_b128 v[64:67], v32 offset:49264
-; GREEDY-NEXT:    ds_read_b128 v[60:63], v32 offset:49248
-; GREEDY-NEXT:    ds_read_b128 v[56:59], v32 offset:49232
-; GREEDY-NEXT:    ds_read_b128 v[52:55], v32 offset:49216
-; GREEDY-NEXT:    ds_read_b128 v[48:51], v32 offset:49200
-; GREEDY-NEXT:    ds_read_b128 v[44:47], v32 offset:49184
-; GREEDY-NEXT:    ds_read_b128 v[40:43], v32 offset:49168
-; GREEDY-NEXT:    ds_read_b128 v[36:39], v32 offset:49152
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 2
-; GREEDY-NEXT:    ds_write_b128 v0, v[60:63] offset:24672
-; GREEDY-NEXT:    ds_write_b128 v0, v[64:67] offset:24688
-; GREEDY-NEXT:    ds_write_b128 v0, v[52:55] offset:24640
-; GREEDY-NEXT:    ds_write_b128 v0, v[56:59] offset:24656
-; GREEDY-NEXT:    ds_write_b128 v0, v[44:47] offset:24608
-; GREEDY-NEXT:    ds_write_b128 v0, v[48:51] offset:24624
-; GREEDY-NEXT:    ds_write_b128 v0, v[36:39] offset:24576
-; GREEDY-NEXT:    ds_write_b128 v0, v[40:43] offset:24592
-; GREEDY-NEXT:    ds_read_b128 v[30:33], v1 offset:57456
-; GREEDY-NEXT:    ds_read_b128 v[26:29], v1 offset:57440
-; GREEDY-NEXT:    ds_read_b128 v[22:25], v1 offset:57424
-; GREEDY-NEXT:    ds_read_b128 v[18:21], v1 offset:57408
-; GREEDY-NEXT:    ds_read_b128 v[2:5], v1 offset:57344
-; GREEDY-NEXT:    ds_read_b128 v[6:9], v1 offset:57360
-; GREEDY-NEXT:    ds_read_b128 v[10:13], v1 offset:57376
-; GREEDY-NEXT:    ds_read_b128 v[14:17], v1 offset:57392
-; GREEDY-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY-NEXT:    v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 7
-; GREEDY-NEXT:    s_nop 2
-; GREEDY-NEXT:    ds_write_b128 v0, v[26:29] offset:32864
-; GREEDY-NEXT:    ds_write_b128 v0, v[30:33] offset:32880
-; GREEDY-NEXT:    ds_write_b128 v0, v[18:21] offset:32832
-; GREEDY-NEXT:    ds_write_b128 v0, v[22:25] offset:32848
-; GREEDY-NEXT:    ds_write_b128 v0, v[10:13] offset:32800
-; GREEDY-NEXT:    ds_write_b128 v0, v[14:17] offset:32816
-; GREEDY-NEXT:    ds_write_b128 v0, v[2:5] offset:32768
-; GREEDY-NEXT:    ds_write_b128 v0, v[6:9] offset:32784
-; GREEDY-NEXT:    s_endpgm
-;
-; EXACT-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
-; EXACT:       ; %bb.0: ; %entry
-; EXACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; EXACT-NEXT:    v_lshlrev_b32_e32 v33, 7, v0
-; EXACT-NEXT:    v_mov_b32_e32 v34, 1.0
-; EXACT-NEXT:    v_mov_b32_e32 v35, 2.0
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_add_u32_e32 v32, s0, v33
-; EXACT-NEXT:    ds_read_b128 v[28:31], v32 offset:112
-; EXACT-NEXT:    ds_read_b128 v[24:27], v32 offset:96
-; EXACT-NEXT:    ds_read_b128 v[20:23], v32 offset:80
-; EXACT-NEXT:    ds_read_b128 v[16:19], v32 offset:64
-; EXACT-NEXT:    ds_read_b128 v[0:3], v32
-; EXACT-NEXT:    ds_read_b128 v[4:7], v32 offset:16
-; EXACT-NEXT:    ds_read_b128 v[8:11], v32 offset:32
-; EXACT-NEXT:    ds_read_b128 v[12:15], v32 offset:48
-; EXACT-NEXT:    v_add_u32_e32 v33, s1, v33
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 2
-; EXACT-NEXT:    ds_write_b128 v33, v[28:31] offset:112
-; EXACT-NEXT:    ds_write_b128 v33, v[24:27] offset:96
-; EXACT-NEXT:    ds_write_b128 v33, v[20:23] offset:80
-; EXACT-NEXT:    ds_write_b128 v33, v[16:19] offset:64
-; EXACT-NEXT:    ds_write_b128 v33, v[12:15] offset:48
-; EXACT-NEXT:    ds_write_b128 v33, v[8:11] offset:32
-; EXACT-NEXT:    ds_write_b128 v33, v[4:7] offset:16
-; EXACT-NEXT:    ds_write_b128 v33, v[0:3]
-; EXACT-NEXT:    ds_read_b128 v[64:67], v32 offset:8304
-; EXACT-NEXT:    ds_read_b128 v[60:63], v32 offset:8288
-; EXACT-NEXT:    ds_read_b128 v[56:59], v32 offset:8272
-; EXACT-NEXT:    ds_read_b128 v[52:55], v32 offset:8256
-; EXACT-NEXT:    ds_read_b128 v[48:51], v32 offset:8240
-; EXACT-NEXT:    ds_read_b128 v[44:47], v32 offset:8224
-; EXACT-NEXT:    ds_read_b128 v[40:43], v32 offset:8208
-; EXACT-NEXT:    ds_read_b128 v[36:39], v32 offset:8192
-; EXACT-NEXT:    v_mov_b32_e32 v0, s1
-; EXACT-NEXT:    v_add_u32_e32 v1, 0x6000, v32
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 2
-; EXACT-NEXT:    ds_write_b128 v0, v[60:63] offset:8288
-; EXACT-NEXT:    ds_write_b128 v0, v[64:67] offset:8304
-; EXACT-NEXT:    ds_write_b128 v0, v[52:55] offset:8256
-; EXACT-NEXT:    ds_write_b128 v0, v[56:59] offset:8272
-; EXACT-NEXT:    ds_write_b128 v0, v[44:47] offset:8224
-; EXACT-NEXT:    ds_write_b128 v0, v[48:51] offset:8240
-; EXACT-NEXT:    ds_write_b128 v0, v[36:39] offset:8192
-; EXACT-NEXT:    ds_write_b128 v0, v[40:43] offset:8208
-; EXACT-NEXT:    ds_read_b128 v[64:67], v32 offset:24688
-; EXACT-NEXT:    ds_read_b128 v[60:63], v32 offset:24672
-; EXACT-NEXT:    ds_read_b128 v[56:59], v32 offset:24656
-; EXACT-NEXT:    ds_read_b128 v[52:55], v32 offset:24640
-; EXACT-NEXT:    ds_read_b128 v[48:51], v32 offset:24624
-; EXACT-NEXT:    ds_read_b128 v[44:47], v32 offset:24608
-; EXACT-NEXT:    ds_read_b128 v[40:43], v32 offset:24592
-; EXACT-NEXT:    ds_read_b128 v[36:39], v32 offset:24576
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 2
-; EXACT-NEXT:    ds_write_b128 v0, v[60:63] offset:16480
-; EXACT-NEXT:    ds_write_b128 v0, v[64:67] offset:16496
-; EXACT-NEXT:    ds_write_b128 v0, v[52:55] offset:16448
-; EXACT-NEXT:    ds_write_b128 v0, v[56:59] offset:16464
-; EXACT-NEXT:    ds_write_b128 v0, v[44:47] offset:16416
-; EXACT-NEXT:    ds_write_b128 v0, v[48:51] offset:16432
-; EXACT-NEXT:    ds_write_b128 v0, v[36:39] offset:16384
-; EXACT-NEXT:    ds_write_b128 v0, v[40:43] offset:16400
-; EXACT-NEXT:    ds_read_b128 v[64:67], v32 offset:49264
-; EXACT-NEXT:    ds_read_b128 v[60:63], v32 offset:49248
-; EXACT-NEXT:    ds_read_b128 v[56:59], v32 offset:49232
-; EXACT-NEXT:    ds_read_b128 v[52:55], v32 offset:49216
-; EXACT-NEXT:    ds_read_b128 v[48:51], v32 offset:49200
-; EXACT-NEXT:    ds_read_b128 v[44:47], v32 offset:49184
-; EXACT-NEXT:    ds_read_b128 v[40:43], v32 offset:49168
-; EXACT-NEXT:    ds_read_b128 v[36:39], v32 offset:49152
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 2
-; EXACT-NEXT:    ds_write_b128 v0, v[60:63] offset:24672
-; EXACT-NEXT:    ds_write_b128 v0, v[64:67] offset:24688
-; EXACT-NEXT:    ds_write_b128 v0, v[52:55] offset:24640
-; EXACT-NEXT:    ds_write_b128 v0, v[56:59] offset:24656
-; EXACT-NEXT:    ds_write_b128 v0, v[44:47] offset:24608
-; EXACT-NEXT:    ds_write_b128 v0, v[48:51] offset:24624
-; EXACT-NEXT:    ds_write_b128 v0, v[36:39] offset:24576
-; EXACT-NEXT:    ds_write_b128 v0, v[40:43] offset:24592
-; EXACT-NEXT:    ds_read_b128 v[30:33], v1 offset:57456
-; EXACT-NEXT:    ds_read_b128 v[26:29], v1 offset:57440
-; EXACT-NEXT:    ds_read_b128 v[22:25], v1 offset:57424
-; EXACT-NEXT:    ds_read_b128 v[18:21], v1 offset:57408
-; EXACT-NEXT:    ds_read_b128 v[2:5], v1 offset:57344
-; EXACT-NEXT:    ds_read_b128 v[6:9], v1 offset:57360
-; EXACT-NEXT:    ds_read_b128 v[10:13], v1 offset:57376
-; EXACT-NEXT:    ds_read_b128 v[14:17], v1 offset:57392
-; EXACT-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACT-NEXT:    v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 7
-; EXACT-NEXT:    s_nop 2
-; EXACT-NEXT:    ds_write_b128 v0, v[26:29] offset:32864
-; EXACT-NEXT:    ds_write_b128 v0, v[30:33] offset:32880
-; EXACT-NEXT:    ds_write_b128 v0, v[18:21] offset:32832
-; EXACT-NEXT:    ds_write_b128 v0, v[22:25] offset:32848
-; EXACT-NEXT:    ds_write_b128 v0, v[10:13] offset:32800
-; EXACT-NEXT:    ds_write_b128 v0, v[14:17] offset:32816
-; EXACT-NEXT:    ds_write_b128 v0, v[2:5] offset:32768
-; EXACT-NEXT:    ds_write_b128 v0, v[6:9] offset:32784
-; EXACT-NEXT:    s_endpgm
-entry:
-  %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx
-  %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr
-  %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64
-  %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr
-  %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128
-  %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr
-  %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192
-  %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr
-  %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256
-  %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr
-  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
-  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
-  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
-  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
-  %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx
-  store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr
-  %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64
-  store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr
-  %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128
-  store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr
-  %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192
-  store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr
-  %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256
-  store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #2
-declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
-
-attributes #0 = { nounwind "amdgpu-flat-workgroup-size"="1,256" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind readnone speculatable }

diff  --git a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir
deleted file mode 100644
index 59dbea34691e7..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/igrouplp-dag-mutation.mir
+++ /dev/null
@@ -1,292 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-igrouplp=1 -amdgpu-igrouplp-exact-solver 2>&1 | FileCheck -check-prefix=EXACT %s
-
----
-name: no_pipeline
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $sgpr0, $vgpr10_vgpr11
-    ; DEFAULT-LABEL: name: no_pipeline
-    ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; PIPELINE-LABEL: name: no_pipeline
-    ; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11
-    ; PIPELINE-NEXT: {{  $}}
-    ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; EXACT-LABEL: name: no_pipeline
-    ; EXACT: liveins: $sgpr0, $vgpr10_vgpr11
-    ; EXACT-NEXT: {{  $}}
-    ; EXACT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; EXACT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; EXACT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; EXACT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    ; EXACT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; EXACT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-...
-
-
----
-name: full_pipe
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11
-    ; DEFAULT-LABEL: name: full_pipe
-    ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
-    ; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
-    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
-    ; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
-    ; DEFAULT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; DEFAULT-NEXT: }
-    ; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec
-    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
-    ; DEFAULT-NEXT:   $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: }
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
-    ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
-    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
-    ; PIPELINE-LABEL: name: full_pipe
-    ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
-    ; PIPELINE-NEXT: {{  $}}
-    ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
-    ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
-    ; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
-    ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
-    ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
-    ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
-    ; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
-    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
-    ; PIPELINE-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; PIPELINE-NEXT: }
-    ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
-    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
-    ; PIPELINE-NEXT:   $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; PIPELINE-NEXT: }
-    ; PIPELINE-NEXT: BUNDLE implicit $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit $vgpr3 {
-    ; PIPELINE-NEXT:   DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
-    ; PIPELINE-NEXT:   DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
-    ; PIPELINE-NEXT: }
-    ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
-    ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-LABEL: name: full_pipe
-    ; EXACT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
-    ; EXACT-NEXT: {{  $}}
-    ; EXACT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    ; EXACT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
-    ; EXACT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
-    ; EXACT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; EXACT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
-    ; EXACT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
-    ; EXACT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
-    ; EXACT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
-    ; EXACT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; EXACT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; EXACT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
-    ; EXACT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
-    ; EXACT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
-    ; EXACT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
-    ; EXACT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; EXACT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; EXACT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; EXACT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; EXACT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; EXACT-NEXT: }
-    ; EXACT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
-    ; EXACT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
-    ; EXACT-NEXT:   $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; EXACT-NEXT:   $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
-    ; EXACT-NEXT: }
-    ; EXACT-NEXT: BUNDLE implicit $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit $vgpr3 {
-    ; EXACT-NEXT:   DS_WRITE_B32 $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
-    ; EXACT-NEXT:   DS_WRITE_B32 killed $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
-    ; EXACT-NEXT: }
-    ; EXACT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
-    ; EXACT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 2, implicit $exec
-    $vgpr3 = V_MOV_B32_e32 3, implicit $exec
-    $vgpr4 = V_MOV_B32_e32 4, implicit $exec
-    $vgpr5 = V_MOV_B32_e32 5, implicit $exec
-    $vgpr30 = V_MOV_B32_e32 30, implicit $exec
-    $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
-    $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
-    $vgpr9 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr24 = V_MOV_B32_e32 1, implicit $exec
-    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    $vgpr26 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr27 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
-    $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
-    $vgpr17 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr18 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
-    DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec
-    $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
-    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec
-...
-
----
-name: block_ends_in_bundle
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
-    ; DEFAULT-LABEL: name: block_ends_in_bundle
-    ; DEFAULT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec {
-    ; DEFAULT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; DEFAULT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec
-    ; DEFAULT-NEXT: }
-    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; PIPELINE-LABEL: name: block_ends_in_bundle
-    ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
-    ; PIPELINE-NEXT: {{  $}}
-    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec {
-    ; PIPELINE-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; PIPELINE-NEXT:   $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec
-    ; PIPELINE-NEXT: }
-    ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; EXACT-LABEL: name: block_ends_in_bundle
-    ; EXACT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3
-    ; EXACT-NEXT: {{  $}}
-    ; EXACT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec {
-    ; EXACT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-    ; EXACT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    ; EXACT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-    ; EXACT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-    ; EXACT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec
-    ; EXACT-NEXT: }
-    ; EXACT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-      $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-      BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
-        $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
-        $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-        $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
-        $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
-        $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
-    }
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
new file mode 100644
index 0000000000000..c30a35996994b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_iglp_opt() #0 {
+; GCN-LABEL: test_iglp_opt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    ; iglp_opt mask(0x00000000)
+; GCN-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 0) #1
+  ret void
+}
+
+define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* noalias %in, <32 x float> addrspace(3)* noalias %out) #0 {
+; GCN-LABEL: test_iglp_opt_mfma_gemm:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
+; GCN-NEXT:    ; iglp_opt mask(0x00000000)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:64
+; GCN-NEXT:    ds_read_b128 a[0:3], v1
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:16
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:32
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    ds_read_b128 a[44:47], v1 offset:8240
+; GCN-NEXT:    ds_read_b128 a[40:43], v1 offset:8224
+; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:8304
+; GCN-NEXT:    ds_read_b128 a[36:39], v1 offset:8208
+; GCN-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
+; GCN-NEXT:    v_add_u32_e32 v4, 0x6000, v1
+; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[116:119], v4 offset:57456
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT:    ds_read_b128 a[52:55], v1 offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[0:3]
+; GCN-NEXT:    ds_read_b128 a[48:51], v1 offset:8256
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
+; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
+; GCN-NEXT:    ds_read_b128 a[112:115], v4 offset:57440
+; GCN-NEXT:    ds_read_b128 a[108:111], v4 offset:57424
+; GCN-NEXT:    ds_read_b128 a[104:107], v4 offset:57408
+; GCN-NEXT:    ds_read_b128 a[88:91], v4 offset:57344
+; GCN-NEXT:    ds_read_b128 a[92:95], v4 offset:57360
+; GCN-NEXT:    ds_read_b128 a[96:99], v4 offset:57376
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
+; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
+; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:49168
+; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:49152
+; GCN-NEXT:    ds_read_b128 a[100:103], v4 offset:57392
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119]
+; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
+; GCN-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
+; GCN-NEXT:    ds_write_b128 v0, a[44:47] offset:8240
+; GCN-NEXT:    ds_write_b128 v0, a[32:35] offset:8192
+; GCN-NEXT:    ds_write_b128 v0, a[36:39] offset:8208
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[56:87], v2, v3, a[56:87]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:32784
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    ds_write_b128 v0, a[80:83] offset:24672
+; GCN-NEXT:    ds_write_b128 v0, a[84:87] offset:24688
+; GCN-NEXT:    ds_write_b128 v0, a[72:75] offset:24640
+; GCN-NEXT:    ds_write_b128 v0, a[76:79] offset:24656
+; GCN-NEXT:    ds_write_b128 v0, a[64:67] offset:24608
+; GCN-NEXT:    ds_write_b128 v0, a[68:71] offset:24624
+; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:24576
+; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:24592
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
+; GCN-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 0)
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %in, i32 %idx
+  %load.0 = load <32 x float>, <32 x float> addrspace(3)* %load.0.addr
+  %load.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.0.addr, i32 64
+  %load.1 = load <32 x float>, <32 x float> addrspace(3)* %load.1.addr
+  %load.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.1.addr, i32 128
+  %load.2 = load <32 x float>, <32 x float> addrspace(3)* %load.2.addr
+  %load.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.2.addr, i32 192
+  %load.3 = load <32 x float>, <32 x float> addrspace(3)* %load.3.addr
+  %load.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %load.3.addr, i32 256
+  %load.4 = load <32 x float>, <32 x float> addrspace(3)* %load.4.addr
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
+  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
+  %store.0.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 %idx
+  store <32 x float> %mai.0, <32 x float> addrspace(3)* %store.0.addr
+  %store.1.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 64
+  store <32 x float> %mai.1, <32 x float> addrspace(3)* %store.1.addr
+  %store.2.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 128
+  store <32 x float> %mai.2, <32 x float> addrspace(3)* %store.2.addr
+  %store.3.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 192
+  store <32 x float> %mai.3, <32 x float> addrspace(3)* %store.3.addr
+  %store.4.addr = getelementptr <32 x float>, <32 x float> addrspace(3)* %out, i32 256
+  store <32 x float> %mai.4, <32 x float> addrspace(3)* %store.4.addr
+  ret void
+}
+
+declare void @llvm.amdgcn.iglp.opt(i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { convergent nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 3aa921859b630..00d1cbbd58c59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -31,6 +31,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(<32
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -50,32 +51,37 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(<32
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
-; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
-; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
-; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
+; GCN-NEXT:    s_waitcnt vmcnt(5)
 ; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
+; GCN-NEXT:    s_waitcnt vmcnt(4)
 ; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
+; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; GCN-NEXT:    v_mul_lo_u32 v18, v18, v18
 ; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_lo_u32 v23, v23, v23
 ; GCN-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v27, v27, v27
 ; GCN-NEXT:    v_mul_lo_u32 v26, v26, v26
 ; GCN-NEXT:    v_mul_lo_u32 v25, v25, v25
 ; GCN-NEXT:    v_mul_lo_u32 v24, v24, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
+; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
+; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
+; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
 ; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
 ; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
@@ -92,6 +98,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(<32
 ; EXACTCUTOFF:       ; %bb.0:
 ; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
@@ -111,32 +118,37 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(<32
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(5)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(4)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(3)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v18, v18, v18
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(2)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v23, v23, v23
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v27, v27, v27
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v26, v26, v26
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v25, v25, v25
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v24, v24, v24
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
@@ -168,13 +180,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
+; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
+; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
+; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -212,27 +230,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; GCN-NEXT:    v_mul_lo_u32 v18, v18, v18
+; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mul_lo_u32 v23, v23, v23
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v25, v25, v25
-; GCN-NEXT:    v_mul_lo_u32 v24, v24, v24
-; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
 ; GCN-NEXT:    v_mul_lo_u32 v27, v27, v27
 ; GCN-NEXT:    v_mul_lo_u32 v26, v26, v26
+; GCN-NEXT:    v_mul_lo_u32 v25, v25, v25
+; GCN-NEXT:    v_mul_lo_u32 v24, v24, v24
 ; GCN-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
-; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v29, v29, v29
-; GCN-NEXT:    v_mul_lo_u32 v28, v28, v28
-; GCN-NEXT:    v_mul_lo_u32 v31, v31, v31
-; GCN-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
 ; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
 ; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
@@ -241,6 +250,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
 ; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
@@ -248,13 +260,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF:       ; %bb.0:
 ; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
@@ -292,27 +310,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(2)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v19, v19, v19
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v18, v18, v18
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v23, v23, v23
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v25, v25, v25
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v24, v24, v24
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v27, v27, v27
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v26, v26, v26
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v25, v25, v25
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v24, v24, v24
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v22, v22, v22
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v29, v29, v29
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v28, v28, v28
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v31, v31, v31
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v30, v30, v30
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:80
@@ -321,6 +330,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:32
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:16
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
@@ -371,9 +383,21 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
+; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
+; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
+; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
@@ -388,20 +412,29 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
+; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -413,6 +446,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
@@ -420,35 +454,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
 ; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
-; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
+; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -459,9 +471,21 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF:       ; %bb.0:
 ; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
@@ -476,20 +500,29 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; EXACTCUTOFF-NEXT:    s_nop 0
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
@@ -501,6 +534,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
@@ -508,35 +542,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
 ; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
-; EXACTCUTOFF-NEXT:    s_nop 0
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(2)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
 ; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
@@ -603,112 +615,105 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(<32 x
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_lshlrev_b32_e32 v99, 7, v0
-; GCN-NEXT:    v_mov_b32_e32 v96, 1.0
-; GCN-NEXT:    v_mov_b32_e32 v97, 2.0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v98, s0, v99
-; GCN-NEXT:    ds_read_b128 v[28:31], v98 offset:112
-; GCN-NEXT:    ds_read_b128 v[24:27], v98 offset:96
-; GCN-NEXT:    ds_read_b128 v[20:23], v98 offset:80
-; GCN-NEXT:    ds_read_b128 v[16:19], v98 offset:64
-; GCN-NEXT:    ds_read_b128 v[0:3], v98
-; GCN-NEXT:    ds_read_b128 v[4:7], v98 offset:16
-; GCN-NEXT:    ds_read_b128 v[8:11], v98 offset:32
-; GCN-NEXT:    ds_read_b128 v[12:15], v98 offset:48
-; GCN-NEXT:    ds_read_b128 v[60:63], v98 offset:8304
-; GCN-NEXT:    ds_read_b128 v[56:59], v98 offset:8288
-; GCN-NEXT:    ds_read_b128 v[52:55], v98 offset:8272
-; GCN-NEXT:    ds_read_b128 v[48:51], v98 offset:8256
-; GCN-NEXT:    ds_read_b128 v[44:47], v98 offset:8240
-; GCN-NEXT:    ds_read_b128 v[40:43], v98 offset:8224
-; GCN-NEXT:    ds_read_b128 v[36:39], v98 offset:8208
-; GCN-NEXT:    ds_read_b128 v[32:35], v98 offset:8192
-; GCN-NEXT:    ds_read_b128 v[92:95], v98 offset:24688
-; GCN-NEXT:    ds_read_b128 v[88:91], v98 offset:24672
-; GCN-NEXT:    ds_read_b128 v[84:87], v98 offset:24656
-; GCN-NEXT:    ds_read_b128 v[80:83], v98 offset:24640
-; GCN-NEXT:    ds_read_b128 v[76:79], v98 offset:24624
-; GCN-NEXT:    ds_read_b128 v[72:75], v98 offset:24608
-; GCN-NEXT:    ds_read_b128 v[68:71], v98 offset:24592
-; GCN-NEXT:    ds_read_b128 v[64:67], v98 offset:24576
-; GCN-NEXT:    v_add_u32_e32 v99, s1, v99
-; GCN-NEXT:    s_waitcnt lgkmcnt(14)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31]
-; GCN-NEXT:    v_add_u32_e32 v100, 0x6000, v98
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    ds_write_b128 v99, v[28:31] offset:112
-; GCN-NEXT:    ds_write_b128 v99, v[24:27] offset:96
-; GCN-NEXT:    ds_write_b128 v99, v[20:23] offset:80
-; GCN-NEXT:    ds_write_b128 v99, v[16:19] offset:64
-; GCN-NEXT:    ds_write_b128 v99, v[12:15] offset:48
-; GCN-NEXT:    ds_write_b128 v99, v[8:11] offset:32
-; GCN-NEXT:    ds_write_b128 v99, v[4:7] offset:16
-; GCN-NEXT:    ds_write_b128 v99, v[0:3]
-; GCN-NEXT:    ds_read_b128 v[28:31], v98 offset:49264
-; GCN-NEXT:    ds_read_b128 v[24:27], v98 offset:49248
-; GCN-NEXT:    ds_read_b128 v[20:23], v98 offset:49232
-; GCN-NEXT:    ds_read_b128 v[16:19], v98 offset:49216
-; GCN-NEXT:    ds_read_b128 v[12:15], v98 offset:49200
-; GCN-NEXT:    ds_read_b128 v[8:11], v98 offset:49184
-; GCN-NEXT:    ds_read_b128 v[4:7], v98 offset:49168
-; GCN-NEXT:    ds_read_b128 v[0:3], v98 offset:49152
-; GCN-NEXT:    s_waitcnt lgkmcnt(14)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63]
-; GCN-NEXT:    v_mov_b32_e32 v98, s1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    ds_write_b128 v98, v[56:59] offset:8288
-; GCN-NEXT:    ds_write_b128 v98, v[60:63] offset:8304
-; GCN-NEXT:    ds_write_b128 v98, v[48:51] offset:8256
-; GCN-NEXT:    ds_write_b128 v98, v[52:55] offset:8272
-; GCN-NEXT:    ds_write_b128 v98, v[40:43] offset:8224
-; GCN-NEXT:    ds_write_b128 v98, v[44:47] offset:8240
-; GCN-NEXT:    ds_write_b128 v98, v[32:35] offset:8192
-; GCN-NEXT:    ds_write_b128 v98, v[36:39] offset:8208
-; GCN-NEXT:    ds_read_b128 v[60:63], v100 offset:57456
-; GCN-NEXT:    ds_read_b128 v[56:59], v100 offset:57440
-; GCN-NEXT:    ds_read_b128 v[52:55], v100 offset:57424
-; GCN-NEXT:    ds_read_b128 v[48:51], v100 offset:57408
-; GCN-NEXT:    ds_read_b128 v[32:35], v100 offset:57344
-; GCN-NEXT:    ds_read_b128 v[36:39], v100 offset:57360
-; GCN-NEXT:    ds_read_b128 v[40:43], v100 offset:57376
-; GCN-NEXT:    ds_read_b128 v[44:47], v100 offset:57392
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[64:95], v96, v97, v[64:95]
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:64
+; GCN-NEXT:    ds_read_b128 a[0:3], v1
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:16
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:32
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:8304
+; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
+; GCN-NEXT:    ds_read_b128 a[52:55], v1 offset:8272
+; GCN-NEXT:    ds_read_b128 a[48:51], v1 offset:8256
+; GCN-NEXT:    ds_read_b128 a[44:47], v1 offset:8240
+; GCN-NEXT:    ds_read_b128 a[40:43], v1 offset:8224
+; GCN-NEXT:    ds_read_b128 a[36:39], v1 offset:8208
+; GCN-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT:    v_add_u32_e32 v2, 0x6000, v1
+; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:24608
+; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
+; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
+; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[120:123], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[116:119], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[112:115], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[108:111], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[104:107], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[100:103], v1 offset:49168
+; GCN-NEXT:    ds_read_b128 a[96:99], v1 offset:49152
+; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
+; GCN-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
+; GCN-NEXT:    ds_read_b128 a[152:155], v2 offset:57440
+; GCN-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
+; GCN-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
+; GCN-NEXT:    ds_read_b128 a[128:131], v2 offset:57344
+; GCN-NEXT:    ds_read_b128 a[132:135], v2 offset:57360
+; GCN-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
+; GCN-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
+; GCN-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(14)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; GCN-NEXT:    s_waitcnt lgkmcnt(8)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 6
-; GCN-NEXT:    ds_write_b128 v98, v[88:91] offset:16480
-; GCN-NEXT:    ds_write_b128 v98, v[92:95] offset:16496
-; GCN-NEXT:    ds_write_b128 v98, v[80:83] offset:16448
-; GCN-NEXT:    ds_write_b128 v98, v[84:87] offset:16464
-; GCN-NEXT:    ds_write_b128 v98, v[72:75] offset:16416
-; GCN-NEXT:    ds_write_b128 v98, v[76:79] offset:16432
-; GCN-NEXT:    ds_write_b128 v98, v[64:67] offset:16384
-; GCN-NEXT:    ds_write_b128 v98, v[68:71] offset:16400
-; GCN-NEXT:    ds_write_b128 v98, v[24:27] offset:24672
-; GCN-NEXT:    ds_write_b128 v98, v[28:31] offset:24688
-; GCN-NEXT:    ds_write_b128 v98, v[16:19] offset:24640
-; GCN-NEXT:    ds_write_b128 v98, v[20:23] offset:24656
-; GCN-NEXT:    ds_write_b128 v98, v[8:11] offset:24608
-; GCN-NEXT:    ds_write_b128 v98, v[12:15] offset:24624
-; GCN-NEXT:    ds_write_b128 v98, v[0:3] offset:24576
-; GCN-NEXT:    ds_write_b128 v98, v[4:7] offset:24592
-; GCN-NEXT:    ds_write_b128 v98, v[56:59] offset:32864
-; GCN-NEXT:    ds_write_b128 v98, v[60:63] offset:32880
-; GCN-NEXT:    ds_write_b128 v98, v[48:51] offset:32832
-; GCN-NEXT:    ds_write_b128 v98, v[52:55] offset:32848
-; GCN-NEXT:    ds_write_b128 v98, v[40:43] offset:32800
-; GCN-NEXT:    ds_write_b128 v98, v[44:47] offset:32816
-; GCN-NEXT:    ds_write_b128 v98, v[32:35] offset:32768
-; GCN-NEXT:    ds_write_b128 v98, v[36:39] offset:32784
+; GCN-NEXT:    s_nop 4
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT:    ds_write_b128 v0, a[0:3]
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
+; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
+; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
+; GCN-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
+; GCN-NEXT:    ds_write_b128 v0, a[44:47] offset:8240
+; GCN-NEXT:    ds_write_b128 v0, a[32:35] offset:8192
+; GCN-NEXT:    ds_write_b128 v0, a[36:39] offset:8208
+; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT:    ds_write_b128 v0, a[120:123] offset:24672
+; GCN-NEXT:    ds_write_b128 v0, a[124:127] offset:24688
+; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:24640
+; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:24656
+; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:24608
+; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:24624
+; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:24576
+; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:24592
+; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[148:151] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[140:143] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[128:131] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:32784
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
 ; GCN-NEXT:    s_endpgm
@@ -716,112 +721,105 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(<32 x
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
 ; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v99, 7, v0
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v96, 1.0
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v97, 2.0
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v98, s0, v99
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[28:31], v98 offset:112
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[24:27], v98 offset:96
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[20:23], v98 offset:80
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[16:19], v98 offset:64
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[0:3], v98
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[4:7], v98 offset:16
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[8:11], v98 offset:32
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[12:15], v98 offset:48
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[60:63], v98 offset:8304
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[56:59], v98 offset:8288
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[52:55], v98 offset:8272
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[48:51], v98 offset:8256
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[44:47], v98 offset:8240
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[40:43], v98 offset:8224
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[36:39], v98 offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[32:35], v98 offset:8192
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[92:95], v98 offset:24688
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[88:91], v98 offset:24672
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[84:87], v98 offset:24656
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[80:83], v98 offset:24640
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[76:79], v98 offset:24624
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[72:75], v98 offset:24608
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[68:71], v98 offset:24592
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[64:67], v98 offset:24576
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v99, s1, v99
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v100, 0x6000, v98
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 1
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[28:31] offset:112
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[24:27] offset:96
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[20:23] offset:80
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[16:19] offset:64
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[12:15] offset:48
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[8:11] offset:32
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[4:7] offset:16
-; EXACTCUTOFF-NEXT:    ds_write_b128 v99, v[0:3]
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[28:31], v98 offset:49264
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[24:27], v98 offset:49248
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[20:23], v98 offset:49232
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[16:19], v98 offset:49216
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[12:15], v98 offset:49200
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[8:11], v98 offset:49184
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[4:7], v98 offset:49168
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[0:3], v98 offset:49152
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63]
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v98, s1
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 1
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[56:59] offset:8288
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[60:63] offset:8304
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[48:51] offset:8256
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[52:55] offset:8272
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[40:43] offset:8224
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[44:47] offset:8240
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[32:35] offset:8192
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[36:39] offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[60:63], v100 offset:57456
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[56:59], v100 offset:57440
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[52:55], v100 offset:57424
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[48:51], v100 offset:57408
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[32:35], v100 offset:57344
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[36:39], v100 offset:57360
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[40:43], v100 offset:57376
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[44:47], v100 offset:57392
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[64:95], v96, v97, v[64:95]
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s0, v0
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:112
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:96
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:80
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:64
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:16
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:32
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:48
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[60:63], v1 offset:8304
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[52:55], v1 offset:8272
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[48:51], v1 offset:8256
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[44:47], v1 offset:8240
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[40:43], v1 offset:8224
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[36:39], v1 offset:8208
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v2, 0x6000, v1
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[76:79], v1 offset:24624
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[72:75], v1 offset:24608
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[124:127], v1 offset:49264
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[120:123], v1 offset:49248
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[116:119], v1 offset:49232
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[112:115], v1 offset:49216
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[108:111], v1 offset:49200
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[104:107], v1 offset:49184
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[100:103], v1 offset:49168
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[96:99], v1 offset:49152
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v1, 1.0
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[156:159], v2 offset:57456
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[152:155], v2 offset:57440
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[148:151], v2 offset:57424
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[144:147], v2 offset:57408
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[128:131], v2 offset:57344
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[132:135], v2 offset:57360
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v2 offset:57376
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v2 offset:57392
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 2.0
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v96, v97, v[0:31]
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(8)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[32:63], v96, v97, v[32:63]
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 6
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[88:91] offset:16480
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[92:95] offset:16496
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[80:83] offset:16448
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[84:87] offset:16464
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[72:75] offset:16416
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[76:79] offset:16432
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[64:67] offset:16384
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[68:71] offset:16400
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[24:27] offset:24672
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[28:31] offset:24688
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[16:19] offset:24640
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[20:23] offset:24656
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[8:11] offset:24608
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[12:15] offset:24624
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[0:3] offset:24576
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[4:7] offset:24592
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[56:59] offset:32864
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[60:63] offset:32880
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[48:51] offset:32832
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[52:55] offset:32848
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[40:43] offset:32800
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[44:47] offset:32816
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[32:35] offset:32768
-; EXACTCUTOFF-NEXT:    ds_write_b128 v98, v[36:39] offset:32784
+; EXACTCUTOFF-NEXT:    s_nop 4
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[44:47] offset:8240
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[32:35] offset:8192
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[36:39] offset:8208
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[88:91] offset:16480
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[92:95] offset:16496
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[80:83] offset:16448
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[84:87] offset:16464
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[72:75] offset:16416
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[76:79] offset:16432
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[64:67] offset:16384
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[68:71] offset:16400
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[120:123] offset:24672
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[124:127] offset:24688
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[112:115] offset:24640
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[116:119] offset:24656
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[104:107] offset:24608
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[108:111] offset:24624
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[96:99] offset:24576
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[100:103] offset:24592
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[152:155] offset:32864
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[156:159] offset:32880
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[144:147] offset:32832
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[148:151] offset:32848
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[136:139] offset:32800
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[140:143] offset:32816
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[128:131] offset:32768
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[132:135] offset:32784
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(40) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
@@ -865,266 +863,266 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(<32
 ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_lshlrev_b32_e32 v33, 7, v0
-; GCN-NEXT:    v_mov_b32_e32 v34, 1.0
-; GCN-NEXT:    v_mov_b32_e32 v35, 2.0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v3, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_u32_e32 v32, s0, v33
-; GCN-NEXT:    ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT:    ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT:    ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT:    ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT:    ds_read_b128 v[0:3], v32
-; GCN-NEXT:    ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT:    ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT:    ds_read_b128 v[12:15], v32 offset:48
-; GCN-NEXT:    v_add_u32_e32 v33, s1, v33
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:112
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:96
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:80
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:64
+; GCN-NEXT:    ds_read_b128 a[0:3], v1
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:16
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:32
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v33, v[28:31] offset:112
-; GCN-NEXT:    ds_write_b128 v33, v[24:27] offset:96
-; GCN-NEXT:    ds_write_b128 v33, v[20:23] offset:80
-; GCN-NEXT:    ds_write_b128 v33, v[16:19] offset:64
-; GCN-NEXT:    ds_write_b128 v33, v[12:15] offset:48
-; GCN-NEXT:    ds_write_b128 v33, v[8:11] offset:32
-; GCN-NEXT:    ds_write_b128 v33, v[4:7] offset:16
-; GCN-NEXT:    ds_write_b128 v33, v[0:3]
-; GCN-NEXT:    ds_read_b128 v[64:67], v32 offset:8304
-; GCN-NEXT:    ds_read_b128 v[60:63], v32 offset:8288
-; GCN-NEXT:    ds_read_b128 v[56:59], v32 offset:8272
-; GCN-NEXT:    ds_read_b128 v[52:55], v32 offset:8256
-; GCN-NEXT:    ds_read_b128 v[48:51], v32 offset:8240
-; GCN-NEXT:    ds_read_b128 v[44:47], v32 offset:8224
-; GCN-NEXT:    ds_read_b128 v[40:43], v32 offset:8208
-; GCN-NEXT:    ds_read_b128 v[36:39], v32 offset:8192
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:80
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:64
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:48
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
+; GCN-NEXT:    ds_write_b128 v0, a[0:3]
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:8224
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:8208
+; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6000, v32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, v[60:63] offset:8288
-; GCN-NEXT:    ds_write_b128 v0, v[64:67] offset:8304
-; GCN-NEXT:    ds_write_b128 v0, v[52:55] offset:8256
-; GCN-NEXT:    ds_write_b128 v0, v[56:59] offset:8272
-; GCN-NEXT:    ds_write_b128 v0, v[44:47] offset:8224
-; GCN-NEXT:    ds_write_b128 v0, v[48:51] offset:8240
-; GCN-NEXT:    ds_write_b128 v0, v[36:39] offset:8192
-; GCN-NEXT:    ds_write_b128 v0, v[40:43] offset:8208
-; GCN-NEXT:    ds_read_b128 v[64:67], v32 offset:24688
-; GCN-NEXT:    ds_read_b128 v[60:63], v32 offset:24672
-; GCN-NEXT:    ds_read_b128 v[56:59], v32 offset:24656
-; GCN-NEXT:    ds_read_b128 v[52:55], v32 offset:24640
-; GCN-NEXT:    ds_read_b128 v[48:51], v32 offset:24624
-; GCN-NEXT:    ds_read_b128 v[44:47], v32 offset:24608
-; GCN-NEXT:    ds_read_b128 v[40:43], v32 offset:24592
-; GCN-NEXT:    ds_read_b128 v[36:39], v32 offset:24576
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
+; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, v[60:63] offset:16480
-; GCN-NEXT:    ds_write_b128 v0, v[64:67] offset:16496
-; GCN-NEXT:    ds_write_b128 v0, v[52:55] offset:16448
-; GCN-NEXT:    ds_write_b128 v0, v[56:59] offset:16464
-; GCN-NEXT:    ds_write_b128 v0, v[44:47] offset:16416
-; GCN-NEXT:    ds_write_b128 v0, v[48:51] offset:16432
-; GCN-NEXT:    ds_write_b128 v0, v[36:39] offset:16384
-; GCN-NEXT:    ds_write_b128 v0, v[40:43] offset:16400
-; GCN-NEXT:    ds_read_b128 v[64:67], v32 offset:49264
-; GCN-NEXT:    ds_read_b128 v[60:63], v32 offset:49248
-; GCN-NEXT:    ds_read_b128 v[56:59], v32 offset:49232
-; GCN-NEXT:    ds_read_b128 v[52:55], v32 offset:49216
-; GCN-NEXT:    ds_read_b128 v[48:51], v32 offset:49200
-; GCN-NEXT:    ds_read_b128 v[44:47], v32 offset:49184
-; GCN-NEXT:    ds_read_b128 v[40:43], v32 offset:49168
-; GCN-NEXT:    ds_read_b128 v[36:39], v32 offset:49152
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:49168
+; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, v[60:63] offset:24672
-; GCN-NEXT:    ds_write_b128 v0, v[64:67] offset:24688
-; GCN-NEXT:    ds_write_b128 v0, v[52:55] offset:24640
-; GCN-NEXT:    ds_write_b128 v0, v[56:59] offset:24656
-; GCN-NEXT:    ds_write_b128 v0, v[44:47] offset:24608
-; GCN-NEXT:    ds_write_b128 v0, v[48:51] offset:24624
-; GCN-NEXT:    ds_write_b128 v0, v[36:39] offset:24576
-; GCN-NEXT:    ds_write_b128 v0, v[40:43] offset:24592
-; GCN-NEXT:    ds_read_b128 v[30:33], v1 offset:57456
-; GCN-NEXT:    ds_read_b128 v[26:29], v1 offset:57440
-; GCN-NEXT:    ds_read_b128 v[22:25], v1 offset:57424
-; GCN-NEXT:    ds_read_b128 v[18:21], v1 offset:57408
-; GCN-NEXT:    ds_read_b128 v[2:5], v1 offset:57344
-; GCN-NEXT:    ds_read_b128 v[6:9], v1 offset:57360
-; GCN-NEXT:    ds_read_b128 v[10:13], v1 offset:57376
-; GCN-NEXT:    ds_read_b128 v[14:17], v1 offset:57392
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
+; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:57456
+; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:57440
+; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:57424
+; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:57408
+; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:57344
+; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:57360
+; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:57376
+; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:57392
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, v[26:29] offset:32864
-; GCN-NEXT:    ds_write_b128 v0, v[30:33] offset:32880
-; GCN-NEXT:    ds_write_b128 v0, v[18:21] offset:32832
-; GCN-NEXT:    ds_write_b128 v0, v[22:25] offset:32848
-; GCN-NEXT:    ds_write_b128 v0, v[10:13] offset:32800
-; GCN-NEXT:    ds_write_b128 v0, v[14:17] offset:32816
-; GCN-NEXT:    ds_write_b128 v0, v[2:5] offset:32768
-; GCN-NEXT:    ds_write_b128 v0, v[6:9] offset:32784
+; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
 ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; EXACTCUTOFF:       ; %bb.0: ; %entry
 ; EXACTCUTOFF-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v33, 7, v0
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v34, 1.0
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, 2.0
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v2, 1.0
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v3, 2.0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v32, s0, v33
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[28:31], v32 offset:112
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[24:27], v32 offset:96
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[20:23], v32 offset:80
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[16:19], v32 offset:64
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[0:3], v32
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[4:7], v32 offset:16
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[8:11], v32 offset:32
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[12:15], v32 offset:48
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v33, s1, v33
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, s0, v0
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:112
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:96
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:80
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:64
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:16
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:32
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[28:31] offset:112
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[24:27] offset:96
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[20:23] offset:80
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[16:19] offset:64
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[12:15] offset:48
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[8:11] offset:32
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[4:7] offset:16
-; EXACTCUTOFF-NEXT:    ds_write_b128 v33, v[0:3]
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[64:67], v32 offset:8304
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[60:63], v32 offset:8288
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[56:59], v32 offset:8272
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[52:55], v32 offset:8256
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[48:51], v32 offset:8240
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[44:47], v32 offset:8224
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[40:43], v32 offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[36:39], v32 offset:8192
+; EXACTCUTOFF-NEXT:    s_nop 1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:112
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:96
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:80
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:64
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:48
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:8224
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:8208
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, 0x6000, v32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[60:63] offset:8288
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[64:67] offset:8304
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[52:55] offset:8256
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[56:59] offset:8272
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[44:47] offset:8224
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[48:51] offset:8240
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[36:39] offset:8192
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[40:43] offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[64:67], v32 offset:24688
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[60:63], v32 offset:24672
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[56:59], v32 offset:24656
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[52:55], v32 offset:24640
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[48:51], v32 offset:24624
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[44:47], v32 offset:24608
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[40:43], v32 offset:24592
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[36:39], v32 offset:24576
+; EXACTCUTOFF-NEXT:    s_nop 1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[60:63] offset:16480
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[64:67] offset:16496
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[52:55] offset:16448
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[56:59] offset:16464
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[44:47] offset:16416
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[48:51] offset:16432
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[36:39] offset:16384
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[40:43] offset:16400
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[64:67], v32 offset:49264
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[60:63], v32 offset:49248
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[56:59], v32 offset:49232
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[52:55], v32 offset:49216
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[48:51], v32 offset:49200
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[44:47], v32 offset:49184
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[40:43], v32 offset:49168
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[36:39], v32 offset:49152
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:49264
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:49248
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:49232
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:49216
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:49200
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:49184
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:49168
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, 0x6000, v1
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[36:67], v34, v35, v[36:67]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[60:63] offset:24672
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[64:67] offset:24688
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[52:55] offset:24640
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[56:59] offset:24656
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[44:47] offset:24608
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[48:51] offset:24624
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[36:39] offset:24576
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[40:43] offset:24592
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[30:33], v1 offset:57456
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[26:29], v1 offset:57440
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[22:25], v1 offset:57424
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[18:21], v1 offset:57408
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[2:5], v1 offset:57344
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[6:9], v1 offset:57360
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[10:13], v1 offset:57376
-; EXACTCUTOFF-NEXT:    ds_read_b128 v[14:17], v1 offset:57392
+; EXACTCUTOFF-NEXT:    s_nop 1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:57456
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:57440
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:57424
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:57408
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:57344
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:57360
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:57376
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:57392
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[26:29] offset:32864
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[30:33] offset:32880
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[18:21] offset:32832
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[22:25] offset:32848
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[10:13] offset:32800
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[14:17] offset:32816
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[2:5] offset:32768
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, v[6:9] offset:32784
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
 entry:
@@ -1193,6 +1191,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
 
-attributes #0 = { nounwind "amdgpu-flat-workgroup-size"="1,256" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone speculatable }

diff  --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
index 962e9947a3ba5..bf52d6f7a4ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pipeline-solver.mir
@@ -211,10 +211,10 @@ body: |
     ; GREEDY: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
     ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
     ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
     ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
@@ -299,10 +299,10 @@ body: |
     ; GREEDY-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GREEDY-NEXT: S_NOP 0
     ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; GREEDY-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
     ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GREEDY-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; GREEDY-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
     ; GREEDY-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; GREEDY-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec


        


More information about the cfe-commits mailing list