[llvm] 2e61dfb - [AMDGPU] Instruction Type Pipeline

via llvm-commits llvm-commits at lists.llvm.org
Tue May 31 10:52:42 PDT 2022


Author: jeff
Date: 2022-05-31T17:48:52Z
New Revision: 2e61dfb1249e80a36a611c889f3ef86fa4cf3c85

URL: https://github.com/llvm/llvm-project/commit/2e61dfb1249e80a36a611c889f3ef86fa4cf3c85
DIFF: https://github.com/llvm/llvm-project/commit/2e61dfb1249e80a36a611c889f3ef86fa4cf3c85.diff

LOG: [AMDGPU] Instruction Type Pipeline

This patch implements a DAG mutation which adds edges between different groups of instructions. The purpose is to try to generate code that conforms to a pipeline (groupA instructions occur before groupB, groupB -> groupC, and so on). Currently the pipeline order is hardcoded as VMEM->DSRead->MFMA->DSWrite, but the patch was designed to be easily extensible. Alias analysis is problematic for pipelining as memory instructions will usually not be able to be reordered w.r.t one another.

Differential Revision: https://reviews.llvm.org/D125997

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h
    llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt

Removed: 
    llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
    llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
    llvm/test/CodeGen/AMDGPU/mfma-cluster.mir


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
deleted file mode 100644
index 91bf6a2f557b0..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-//===--- AMDGPUMFMAClusting.cpp - AMDGPU MFMA Clustering  -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This file contains a DAG scheduling mutation to cluster MFMA
-///      instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUMFMAClustering.h"
-#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-mfma-clustering"
-
-namespace {
-
-static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster",
-                                       cl::desc("Enable MFMA clustering"),
-                                       cl::init(false));
-
-static cl::opt<unsigned>
-    MaxMFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden,
-                       cl::desc("The maximum number of MFMA instructions to "
-                                "attempt to cluster together."));
-
-class MFMAClusterDAGMutation : public ScheduleDAGMutation {
-  const SIInstrInfo *TII;
-  ScheduleDAGMI *DAG;
-
-public:
-  MFMAClusterDAGMutation() = default;
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-
-static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
-                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
-  for (SUnit &SU : DAG->SUnits) {
-    MachineInstr &MAI = *SU.getInstr();
-    if (!TII->isMAI(MAI) ||
-        MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
-        MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
-      continue;
-
-    MFMASUnits.push_back(&SU);
-
-    LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU););
-  }
-
-  // Sorting the MFMAs in NodeNum order results in a good clustering order
-  std::sort(MFMASUnits.begin(), MFMASUnits.end(),
-            [](SUnit *a, SUnit *b) { return a->NodeNum < b->NodeNum; });
-}
-
-static void propagateDeps(DenseMap<unsigned, unsigned> &SUnit2ClusterInfo,
-                          llvm::ArrayRef<SDep> ClusterPreds,
-                          llvm::ArrayRef<SDep> ClusterSuccs,
-                          unsigned ClusterNum, ScheduleDAGInstrs *DAG) {
-
-  for (auto Node : SUnit2ClusterInfo) {
-    if (Node.second != ClusterNum)
-      continue; // Only add the combined succs to the current cluster
-
-    LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << Node.first << ")\n");
-
-    for (const SDep &Succ : ClusterSuccs) {
-      LLVM_DEBUG(dbgs() << "Copying Succ SU(" << Succ.getSUnit()->NodeNum
-                        << ")\n");
-      DAG->addEdge(Succ.getSUnit(),
-                   SDep(&DAG->SUnits[Node.first], SDep::Artificial));
-    }
-
-    for (const SDep &Pred : ClusterPreds) {
-      LLVM_DEBUG(dbgs() << "Copying Pred SU(" << Pred.getSUnit()->NodeNum
-                        << ")\n");
-      if (Pred.getSUnit()->NodeNum == ClusterNum)
-        continue;
-      DAG->addEdge(&DAG->SUnits[Node.first],
-                   SDep(Pred.getSUnit(), SDep::Artificial));
-    }
-  }
-}
-
-static void clusterNeighboringMFMAs(llvm::ArrayRef<SUnit *> MFMASUnits,
-                                    ScheduleDAGInstrs *DAG) {
-
-  DenseMap<unsigned, unsigned> SUnit2ClusterInfo;
-
-  for (unsigned Idx = 0, End = MFMASUnits.size(); Idx < (End - 1); ++Idx) {
-    if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
-      continue; // We don't want to cluster against a different cluster
-
-    auto MFMAOpa = MFMASUnits[Idx];
-    auto ClusterBase = MFMAOpa;
-    unsigned ClusterNum = ClusterBase->NodeNum;
-    SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
-    SmallVector<SDep, 4> ClusterPreds(MFMAOpa->Preds);
-    unsigned NextIdx = Idx + 1;
-    unsigned ClusterSize = 1;
-
-    // Attempt to cluster all the remaining MFMASunits in a chain
-    // starting at ClusterBase/MFMAOpa.
-    for (; NextIdx < End; ++NextIdx) {
-      if (ClusterSize >= MaxMFMAClusterSize || NextIdx >= End)
-        break;
-      // Only add independent MFMAs that have not been previously clustered
-      if (SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) ||
-          DAG->IsReachable(MFMASUnits[NextIdx], ClusterBase) ||
-          DAG->IsReachable(ClusterBase, MFMASUnits[NextIdx]))
-        continue;
-
-      auto MFMAOpb = MFMASUnits[NextIdx];
-      // Aggregate the cluster inst dependencies for dep propogation
-      ClusterPreds.append(MFMAOpb->Preds);
-      ClusterSuccs.append(MFMAOpb->Succs);
-      if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster)))
-        continue;
-
-      // Enforce ordering to ensure root/leaf of cluster chain gets
-      // scheduled first/last
-      DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial));
-
-      LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU("
-                        << MFMAOpb->NodeNum << ")\n");
-
-      SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum;
-      SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum;
-      ++ClusterSize;
-      MFMAOpa = MFMAOpb;
-    }
-    propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum,
-                  DAG);
-  }
-}
-
-void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
-  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  if (!ST.hasMAIInsts())
-    return;
-  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
-  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
-  if (!TSchedModel || DAG->SUnits.empty())
-    return;
-
-  SmallVector<SUnit *, 32> MFMASUnits;
-  collectMFMASUnits(MFMASUnits, TII, DAG);
-
-  if (MFMASUnits.size() < 2)
-    return;
-
-  clusterNeighboringMFMAs(MFMASUnits, DAG);
-}
-
-} // namespace
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
-  return EnableMFMACluster ? std::make_unique<MFMAClusterDAGMutation>()
-                           : nullptr;
-}
-
-} // end namespace llvm

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
new file mode 100644
index 0000000000000..c4b1918ff10b0
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
@@ -0,0 +1,219 @@
+//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP  ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file contains a DAG scheduling mutation which tries to coerce
+//       the scheduler into generating an ordering based on ordering of groups
+//       of instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMFMAIGroupLP.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP"
+
+namespace {
+
+static cl::opt<bool>
+    EnableMFMAIGroupLP("amdgpu-mfma-igrouplp",
+                       cl::desc("Enable construction of Instruction Groups and "
+                                "their ordering for scheduling"),
+                       cl::init(false));
+
+static cl::opt<int>
+    VMEMGroupMaxSize("amdgpu-mfma-igrouplp-vmem-group-size", cl::init(-1),
+                     cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in VMEM group."));
+
+static cl::opt<int>
+    MFMAGroupMaxSize("amdgpu-mfma-igrouplp-mfma-group-size", cl::init(-1),
+                     cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in MFMA group."));
+
+static cl::opt<int>
+    LDRGroupMaxSize("amdgpu-mfma-igrouplp-ldr-group-size", cl::init(-1),
+                    cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds read group."));
+
+static cl::opt<int>
+    LDWGroupMaxSize("amdgpu-mfma-igrouplp-ldw-group-size", cl::init(-1),
+                    cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds write group."));
+
+typedef function_ref<bool(const MachineInstr &)> IsInstructionType;
+
+struct InstructionClass {
+  SmallVector<SUnit *, 32> Collection;
+  const IsInstructionType isInstructionClass;
+  // MaxSize is initialized to -1 by default, if MaxSize is < 0, then
+  // the collection will not have a size limit
+  const int MaxSize;
+
+  InstructionClass(IsInstructionType IsInstructionClass, int maxSize)
+      : isInstructionClass(IsInstructionClass), MaxSize(maxSize){};
+
+  bool IsFull() { return !(MaxSize <= 0) && (int)Collection.size() >= MaxSize; }
+};
+
+class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation {
+public:
+  const SIInstrInfo *TII;
+  ScheduleDAGMI *DAG;
+
+  MFMAIGroupLPDAGMutation() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+static void collectSUnits(SmallVectorImpl<InstructionClass *> &PipelineOrder,
+                          const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
+
+    // Presently, a bundle only counts as one instruction towards
+    // the group's maximum size
+    if (SU.getInstr()->getOpcode() == TargetOpcode::BUNDLE) {
+      MachineInstr *MI = SU.getInstr();
+      MachineBasicBlock::instr_iterator BundledMI = MI->getIterator();
+      ++BundledMI;
+
+      LLVM_DEBUG(dbgs() << "Checking bundled insts\n";);
+
+      InstructionClass *MatchingStage = nullptr;
+      for (auto Stage : PipelineOrder) {
+        if (Stage->isInstructionClass(*BundledMI) && !Stage->IsFull()) {
+          MatchingStage = Stage;
+          break;
+        }
+      }
+
+      if (MatchingStage != nullptr) {
+        while (MatchingStage->isInstructionClass(*BundledMI)) {
+          if (!BundledMI->isBundledWithSucc())
+            break;
+          ++BundledMI;
+        }
+
+        if (!BundledMI->isBundledWithSucc()) {
+          LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";);
+          MatchingStage->Collection.push_back(&SU);
+        }
+      }
+    }
+
+    for (InstructionClass *Stage : PipelineOrder) {
+      if (Stage->isInstructionClass(*SU.getInstr()) && !Stage->IsFull()) {
+        Stage->Collection.push_back(&SU);
+      }
+    }
+  }
+}
+
+static void
+addPipelineEdges(const llvm::ArrayRef<InstructionClass *> PipelineOrder,
+                 ScheduleDAGInstrs *DAG) {
+  for (int i = 0; i < (int)PipelineOrder.size() - 1; i++) {
+    auto StageA = PipelineOrder[i];
+    for (int j = i + 1; j < (int)PipelineOrder.size(); j++) {
+      auto StageB = PipelineOrder[j];
+      for (auto SUnitA : StageA->Collection) {
+        LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA););
+        for (auto SUnitB : StageB->Collection) {
+          if (DAG->canAddEdge(SUnitB, SUnitA)) {
+            DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial));
+            LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB););
+          } else {
+            LLVM_DEBUG(dbgs() << "Can't add edge to: ";
+                       DAG->dumpNode(*SUnitB););
+          }
+        }
+      }
+    }
+  }
+}
+
+void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  if (!ST.hasMAIInsts())
+    return;
+  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+  if (!TSchedModel || DAG->SUnits.empty())
+    return;
+
+  const IsInstructionType isMFMAFn = [this](const MachineInstr &MI) {
+    if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) {
+      LLVM_DEBUG(dbgs() << "Found MFMA\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize);
+
+  const IsInstructionType isVMEMReadFn = [this](const MachineInstr &MI) {
+    if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) &&
+        MI.mayLoad()) {
+      LLVM_DEBUG(dbgs() << "Found VMEM read\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize);
+
+  const IsInstructionType isDSWriteFn = [this](const MachineInstr &MI) {
+    if (TII->isDS(MI) && MI.mayStore()) {
+      LLVM_DEBUG(dbgs() << "Found DS Write\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize);
+
+  const IsInstructionType isDSReadFn = [this](const MachineInstr &MI) {
+    if (TII->isDS(MI) && MI.mayLoad()) {
+      LLVM_DEBUG(dbgs() << "Found DS Read\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize);
+
+  // The order of InstructionClasses in this vector defines the
+  // order in which edges will be added. In other words, given the
+  // present ordering, we will try to make each VMEMRead instruction
+  // a predecessor of each DSRead instruction, and so on.
+  SmallVector<InstructionClass *, 4> PipelineOrder = {
+      &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits};
+
+  collectSUnits(PipelineOrder, TII, DAG);
+
+  addPipelineEdges(PipelineOrder, DAG);
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation() {
+  return EnableMFMAIGroupLP ? std::make_unique<MFMAIGroupLPDAGMutation>()
+                            : nullptr;
+}
+
+} // end namespace llvm

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h
similarity index 56%
rename from llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
rename to llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h
index 670ff42ca30a2..7a830934e5043 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h
@@ -1,4 +1,4 @@
-//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
+//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,16 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
 
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include <memory>
 
 namespace llvm {
 
-std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
+std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation();
 
 } // namespace llvm
 
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 27751f47049a2..7291d5d65e09f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,7 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUExportClustering.h"
-#include "AMDGPUMFMAClustering.h"
+#include "AMDGPUMFMAIGroupLP.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
@@ -399,7 +399,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createMFMAClusterDAGMutation());
+  DAG->addMutation(createMFMAIGroupLPDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -881,7 +881,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
-    DAG->addMutation(createMFMAClusterDAGMutation());
+    DAG->addMutation(createMFMAIGroupLPDAGMutation());
     return DAG;
   }
 

diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index ed78923b2a44f..267d3e686ad08 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,7 +75,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
-  AMDGPUMFMAClustering.cpp
+  AMDGPUMFMAIGroupLP.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp

diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
deleted file mode 100644
index 789e813252dbc..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
+++ /dev/null
@@ -1,71 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering  2>&1 | FileCheck -check-prefix=PRERA %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering  2>&1 | FileCheck -check-prefix=TWOLIMIT %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
-# REQUIRES: asserts
-
-# PRERA: Cluster MFMA SU(2) - SU(6)
-# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
-# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
-
-# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
-# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
-
-# POSTRA: Cluster MFMA SU(2) - SU(6)
-# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
-# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
-
----
-name: basic_cluster
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-...
-
-# PRERA: Cluster MFMA SU(12) - SU(16)
-# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
-
-# POSTRA: Cluster MFMA SU(12) - SU(16)
-# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
-
----
-name: complex_cluster
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-...

diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
deleted file mode 100644
index ab8c0605f52fd..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
+++ /dev/null
@@ -1,354 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched  %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s
-
-
----
-name: no_cluster
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins: $sgpr0, $vgpr10_vgpr11
-    ; PRERA-LABEL: name: no_cluster
-    ; PRERA: liveins: $sgpr0, $vgpr10_vgpr11
-    ; PRERA-NEXT: {{  $}}
-    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; DEFAULT-LABEL: name: no_cluster
-    ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; BOTHSCHEDPASS-LABEL: name: no_cluster
-    ; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11
-    ; BOTHSCHEDPASS-NEXT: {{  $}}
-    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    ; TWOLIMIT-LABEL: name: no_cluster
-    ; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11
-    ; TWOLIMIT-NEXT: {{  $}}
-    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; POSTRA-LABEL: name: no_cluster
-    ; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11
-    ; POSTRA-NEXT: {{  $}}
-    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
-...
-
-
----
-name: basic_cluster
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; PRERA-LABEL: name: basic_cluster
-    ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; PRERA-NEXT: {{  $}}
-    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-LABEL: name: basic_cluster
-    ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-LABEL: name: basic_cluster
-    ; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; BOTHSCHEDPASS-NEXT: {{  $}}
-    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-LABEL: name: basic_cluster
-    ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; TWOLIMIT-NEXT: {{  $}}
-    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-LABEL: name: basic_cluster
-    ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
-    ; POSTRA-NEXT: {{  $}}
-    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-...
-
-
----
-name: complex_cluster
-tracksRegLiveness: true
-body:             |
-  bb.0:
-    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
-    ; PRERA-LABEL: name: complex_cluster
-    ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
-    ; PRERA-NEXT: {{  $}}
-    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; DEFAULT-LABEL: name: complex_cluster
-    ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
-    ; DEFAULT-NEXT: {{  $}}
-    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; BOTHSCHEDPASS-LABEL: name: complex_cluster
-    ; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11
-    ; BOTHSCHEDPASS-NEXT: {{  $}}
-    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    ; TWOLIMIT-LABEL: name: complex_cluster
-    ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
-    ; TWOLIMIT-NEXT: {{  $}}
-    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; POSTRA-LABEL: name: complex_cluster
-    ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
-    ; POSTRA-NEXT: {{  $}}
-    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
-    ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
-    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
-    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
-    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
-    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
-    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
-    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
-    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
-    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
-    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
-    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir
new file mode 100644
index 0000000000000..f7139cc3c7bf7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir
@@ -0,0 +1,183 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s
+
+---
+name: no_pipeline
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $vgpr10_vgpr11
+    ; DEFAULT-LABEL: name: no_pipeline
+    ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; PIPELINE-LABEL: name: no_pipeline
+    ; PIPELINE: liveins: $sgpr0, $vgpr10_vgpr11
+    ; PIPELINE-NEXT: {{  $}}
+    ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+...
+
+
+---
+name: full_pipe
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11
+    ; DEFAULT-LABEL: name: full_pipe
+    ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
+    ; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
+    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
+    ; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
+    ; DEFAULT-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; DEFAULT-NEXT: }
+    ; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec
+    ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
+    ; DEFAULT-NEXT:   $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
+    ; DEFAULT-NEXT:   $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: }
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
+    ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
+    ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
+    ; PIPELINE-LABEL: name: full_pipe
+    ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11
+    ; PIPELINE-NEXT: {{  $}}
+    ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec
+    ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+    ; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec
+    ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec
+    ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec
+    ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec
+    ; PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec
+    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec {
+    ; PIPELINE-NEXT:   $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    ; PIPELINE-NEXT: }
+    ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
+    ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec {
+    ; PIPELINE-NEXT:   $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
+    ; PIPELINE-NEXT:   $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec
+    ; PIPELINE-NEXT: }
+    ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 {
+    ; PIPELINE-NEXT:   DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec
+    ; PIPELINE-NEXT:   DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec
+    ; PIPELINE-NEXT: }
+    ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 2, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 4, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 5, implicit $exec
+    $vgpr30 = V_MOV_B32_e32 30, implicit $exec
+    $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec
+    $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr24 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec
+    $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    $vgpr26 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr27 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec
+    $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec
+    DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec
+    $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
+    $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr18 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec
+    DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec
+    $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec
+...


        


More information about the llvm-commits mailing list