[llvm] f822db7 - [AMDGPU] Allow for MFMA Inst Clustering

via llvm-commits llvm-commits at lists.llvm.org
Tue May 10 12:59:19 PDT 2022


Author: jeff
Date: 2022-05-10T12:57:40-07:00
New Revision: f822db7670d4399bcc90830f23fdb5cec6878c73

URL: https://github.com/llvm/llvm-project/commit/f822db7670d4399bcc90830f23fdb5cec6878c73
DIFF: https://github.com/llvm/llvm-project/commit/f822db7670d4399bcc90830f23fdb5cec6878c73.diff

LOG: [AMDGPU] Allow for MFMA Inst Clustering

This patch adds cluster edges between independent MFMA instructions. Additionally, it propagates all predecessors of cluster insts to the root of the cluster(s), and all successors to the leaf(ves) of the cluster(s) -- this is done to remove the possibility that those insts will be interspersed within the cluster.

Reviewed By: kerbowa

Differential Revision: https://reviews.llvm.org/D124678

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
    llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
    llvm/test/CodeGen/AMDGPU/mfma-cluster.mir

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
new file mode 100644
index 0000000000000..5eb79ceab8978
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
@@ -0,0 +1,175 @@
+//===--- AMDGPUMFMAClustering.cpp - AMDGPU MFMA Clustering ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to cluster MFMA
+///      instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMFMAClustering.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mfma-clustering"
+
+namespace {
+
+static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster",
+                                       cl::desc("Enable MFMA clustering"),
+                                       cl::init(false));
+
+static cl::opt<unsigned>
+    MaxMFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden,
+                       cl::desc("The maximum number of MFMA instructions to "
+                                "attempt to cluster together."));
+
+class MFMAClusterDAGMutation : public ScheduleDAGMutation {
+  const SIInstrInfo *TII;
+  ScheduleDAGMI *DAG;
+
+public:
+  MFMAClusterDAGMutation() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
+                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    MachineInstr &MAI = *SU.getInstr();
+    if (!TII->isMAI(MAI) ||
+        MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+        MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
+      continue;
+
+    MFMASUnits.push_back(&SU);
+
+    LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU););
+  }
+
+  // Sorting the MFMAs in NodeNum order results in a good clustering order
+  std::sort(MFMASUnits.begin(), MFMASUnits.end(),
+            [](SUnit *a, SUnit *b) { return a->NodeNum < b->NodeNum; });
+}
+
+static void propagateDeps(DenseMap<unsigned, unsigned> &SUnit2ClusterInfo,
+                          llvm::ArrayRef<SDep> ClusterPreds,
+                          llvm::ArrayRef<SDep> ClusterSuccs,
+                          unsigned ClusterNum, ScheduleDAGInstrs *DAG) {
+
+  for (auto Node : SUnit2ClusterInfo) {
+    if (Node.second != ClusterNum)
+      continue; // Only add the combined succs to the current cluster
+
+    LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << Node.first << ")\n");
+
+    for (const SDep &Succ : ClusterSuccs) {
+      LLVM_DEBUG(dbgs() << "Copying Succ SU(" << Succ.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Succ.getSUnit(),
+                   SDep(&DAG->SUnits[Node.first], SDep::Artificial));
+    }
+
+    for (const SDep &Pred : ClusterPreds) {
+      LLVM_DEBUG(dbgs() << "Copying Pred SU(" << Pred.getSUnit()->NodeNum
+                        << ")\n");
+      if (Pred.getSUnit()->NodeNum == ClusterNum)
+        continue;
+      DAG->addEdge(&DAG->SUnits[Node.first],
+                   SDep(Pred.getSUnit(), SDep::Artificial));
+    }
+  }
+}
+
+static void clusterNeighboringMFMAs(llvm::ArrayRef<SUnit *> MFMASUnits,
+                                    ScheduleDAGInstrs *DAG) {
+
+  DenseMap<unsigned, unsigned> SUnit2ClusterInfo;
+
+  for (unsigned Idx = 0, End = MFMASUnits.size(); Idx < (End - 1); ++Idx) {
+    if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
+      continue; // We don't want to cluster against a different cluster
+
+    auto MFMAOpa = MFMASUnits[Idx];
+    auto ClusterBase = MFMAOpa;
+    unsigned ClusterNum = ClusterBase->NodeNum;
+    SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
+    SmallVector<SDep, 4> ClusterPreds(MFMAOpa->Preds);
+    unsigned NextIdx = Idx + 1;
+    unsigned ClusterSize = 1;
+
+    // Attempt to cluster all the remaining MFMASunits in a chain
+    // starting at ClusterBase/MFMAOpa.
+    for (; NextIdx < End; ++NextIdx) {
+      if (ClusterSize >= MaxMFMAClusterSize || NextIdx >= End)
+        break;
+      // Only add independent MFMAs that have not been previously clustered
+      if (SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) ||
+          DAG->IsReachable(MFMASUnits[NextIdx], ClusterBase) ||
+          DAG->IsReachable(ClusterBase, MFMASUnits[NextIdx]))
+        continue;
+
+      auto MFMAOpb = MFMASUnits[NextIdx];
+      // Aggregate the cluster inst dependencies for dep propagation
+      ClusterPreds.append(MFMAOpb->Preds);
+      ClusterSuccs.append(MFMAOpb->Succs);
+      if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster)))
+        continue;
+
+      // Enforce ordering to ensure root/leaf of cluster chain gets
+      // scheduled first/last
+      DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial));
+
+      LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU("
+                        << MFMAOpb->NodeNum << ")\n");
+
+      SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum;
+      SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum;
+      ++ClusterSize;
+      MFMAOpa = MFMAOpb;
+    }
+    propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum,
+                  DAG);
+  }
+}
+
+void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  const SIMachineFunctionInfo *MFI =
+      DAGInstrs->MF.getInfo<SIMachineFunctionInfo>();
+  if (!ST.hasMAIInsts())
+    return;
+  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+  if (!TSchedModel || DAG->SUnits.empty())
+    return;
+
+  SmallVector<SUnit *, 32> MFMASUnits;
+  collectMFMASUnits(MFMASUnits, TII, DAG);
+
+  if (MFMASUnits.size() < 2)
+    return;
+
+  clusterNeighboringMFMAs(MFMASUnits, DAG);
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
+  return EnableMFMACluster ? std::make_unique<MFMAClusterDAGMutation>()
+                           : nullptr;
+}
+
+} // end namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
new file mode 100644
index 0000000000000..670ff42ca30a2
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
@@ -0,0 +1,21 @@
+//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dd0fd5fc9f984..27751f47049a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUExportClustering.h"
+#include "AMDGPUMFMAClustering.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
@@ -398,6 +399,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createMFMAClusterDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -879,6 +881,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
     const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+    DAG->addMutation(createMFMAClusterDAGMutation());
     return DAG;
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 39685c33b0397..ed78923b2a44f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,6 +75,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPUMFMAClustering.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
new file mode 100644
index 0000000000000..789e813252dbc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
@@ -0,0 +1,71 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering  2>&1 | FileCheck -check-prefix=PRERA %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering  2>&1 | FileCheck -check-prefix=TWOLIMIT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
+# REQUIRES: asserts
+
+# PRERA: Cluster MFMA SU(2) - SU(6)
+# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
+# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
+
+# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
+# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
+
+# POSTRA: Cluster MFMA SU(2) - SU(6)
+# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
+# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
+
+---
+name: basic_cluster
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+
+# PRERA: Cluster MFMA SU(12) - SU(16)
+# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
+
+# POSTRA: Cluster MFMA SU(12) - SU(16)
+# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
+
+---
+name: complex_cluster
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
new file mode 100644
index 0000000000000..ab8c0605f52fd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
@@ -0,0 +1,354 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched  %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s
+
+
+---
+name: no_cluster
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $vgpr10_vgpr11
+    ; PRERA-LABEL: name: no_cluster
+    ; PRERA: liveins: $sgpr0, $vgpr10_vgpr11
+    ; PRERA-NEXT: {{  $}}
+    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; DEFAULT-LABEL: name: no_cluster
+    ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; BOTHSCHEDPASS-LABEL: name: no_cluster
+    ; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11
+    ; BOTHSCHEDPASS-NEXT: {{  $}}
+    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    ; TWOLIMIT-LABEL: name: no_cluster
+    ; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11
+    ; TWOLIMIT-NEXT: {{  $}}
+    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; POSTRA-LABEL: name: no_cluster
+    ; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11
+    ; POSTRA-NEXT: {{  $}}
+    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
+...
+
+
+---
+name: basic_cluster
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; PRERA-LABEL: name: basic_cluster
+    ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; PRERA-NEXT: {{  $}}
+    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-LABEL: name: basic_cluster
+    ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-LABEL: name: basic_cluster
+    ; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; BOTHSCHEDPASS-NEXT: {{  $}}
+    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-LABEL: name: basic_cluster
+    ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; TWOLIMIT-NEXT: {{  $}}
+    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-LABEL: name: basic_cluster
+    ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    ; POSTRA-NEXT: {{  $}}
+    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+
+
+---
+name: complex_cluster
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins:  $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7,  $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    ; PRERA-LABEL: name: complex_cluster
+    ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    ; PRERA-NEXT: {{  $}}
+    ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; DEFAULT-LABEL: name: complex_cluster
+    ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    ; DEFAULT-NEXT: {{  $}}
+    ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; BOTHSCHEDPASS-LABEL: name: complex_cluster
+    ; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11
+    ; BOTHSCHEDPASS-NEXT: {{  $}}
+    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    ; TWOLIMIT-LABEL: name: complex_cluster
+    ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    ; TWOLIMIT-NEXT: {{  $}}
+    ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; POSTRA-LABEL: name: complex_cluster
+    ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    ; POSTRA-NEXT: {{  $}}
+    ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
+    ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...


        


More information about the llvm-commits mailing list