[llvm] [AMDGPU] Introduce iglp_opt(2): Generalized exp/mfma interleaving for select kernels (PR #81342)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 22 13:18:25 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/81342
>From 1f4d9b3895cbebe5651b04817c2c94dfc9bb164f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 1 Feb 2024 13:57:13 -0800
Subject: [PATCH 1/3] [AMDGPU] Introduce IGLPPhase
Change-Id: I24170c286115863eebbf71973d8e86f4dea90c49
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 34 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 6 +++-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 ++--
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7 ++--
4 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4462cd8a31f13e..2aaaf6685e3543 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -852,7 +852,7 @@ class IGLPStrategy {
virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) = 0;
+ IGLPPhase Phase) = 0;
// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -871,7 +871,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) override;
+ IGLPPhase Phase) override;
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
@@ -884,7 +884,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) {
+ IGLPPhase Phase) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -1101,7 +1101,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) override;
+ IGLPPhase Phase) override;
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
@@ -1118,13 +1118,14 @@ static unsigned DSWWithSharedVMEMCount = 0;
void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) {
+ IGLPPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
- assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
- DSWWithSharedVMEMCount == 0)) &&
- "DSWCounters should be zero in pre-RA scheduling!");
+ assert(
+ (Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+ DSWWithSharedVMEMCount == 0)) &&
+ "DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto I = SU.getInstr();
@@ -1133,7 +1134,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
- else if (I->mayStore() && !IsReentry) {
+ else if (I->mayStore() && Phase == IGLPPhase::Initial) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1146,7 +1147,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}
- if (!IsReentry) {
+ if (Phase == IGLPPhase::Initial) {
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();
@@ -1414,10 +1415,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
bool IsBottomUp = 1;
// Whether or not this is a reentry into the IGroupLPDAGMutation.
- bool IsReentry = false;
+ IGLPPhase Phase = IGLPPhase::Initial;
IGroupLPDAGMutation() = default;
- IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
+ IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {}
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1717,7 +1718,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
auto S = createIGLPStrategy(StrategyID, DAG, TII);
if (S->shouldApplyStrategy(DAG)) {
IsBottomUp = S->IsBottomUp;
- S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
+ S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
}
}
@@ -1725,13 +1726,14 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
namespace llvm {
-/// \p IsReentry specifes whether or not this is a reentry into the
+/// \p Phase specifes whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
- return std::make_unique<IGroupLPDAGMutation>(IsReentry);
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(IGLPPhase Phase) {
+ return std::make_unique<IGroupLPDAGMutation>(Phase);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 3ec8be4f889205..dec723fbf1fb83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -14,7 +14,11 @@
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry);
+// Components of the mask that determines which instruction types may be may be
+// classified into a SchedGroup.
+enum class IGLPPhase { Initial = 0u, PreRAReentry = 1u << 0, PostRA = 1u << 1 };
+
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e26b4cf820a52b..51090891a38891 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -471,7 +471,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
return DAG;
}
@@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
- DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4081115aa68cad..d8e28c9ec9409c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
+ DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -855,7 +855,8 @@ bool GCNSchedStage::initGCNRegion() {
SavedMutations.swap(DAG.Mutations);
bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule;
- DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage));
+ DAG.addMutation(createIGroupLPDAGMutation(
+ IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
}
return true;
@@ -1569,7 +1570,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true));
+ addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA));
}
ScheduleDAGMI::schedule();
>From 06bf083312b23441f36f6be57fabd14bbc9039f3 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 13 Feb 2024 16:28:58 -0800
Subject: [PATCH 2/3] [AMDGPU] Fall back to SavedMutations when not applying
IGLP
Change-Id: I4ef67afa5e68714231a01afb1e9f58dbddd07368
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 67 ++++++++++++-------
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 5 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++-
4 files changed, 56 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 2aaaf6685e3543..94b0cce7b5cc94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -849,7 +849,7 @@ class IGLPStrategy {
public:
/// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
- virtual void applyIGLPStrategy(
+ virtual bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
IGLPPhase Phase) = 0;
@@ -868,7 +868,7 @@ class IGLPStrategy {
class MFMASmallGemmOpt final : public IGLPStrategy {
private:
public:
- void applyIGLPStrategy(
+ bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
IGLPPhase Phase) override;
@@ -881,7 +881,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
}
};
-void MFMASmallGemmOpt::applyIGLPStrategy(
+bool MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
IGLPPhase Phase) {
@@ -902,6 +902,8 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}
+
+ return true;
}
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
@@ -1098,7 +1100,7 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
};
public:
- void applyIGLPStrategy(
+ bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
IGLPPhase Phase) override;
@@ -1115,7 +1117,7 @@ static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
-void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
+bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
IGLPPhase Phase) {
@@ -1355,6 +1357,8 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}
+
+ return true;
}
static std::unique_ptr<IGLPStrategy>
@@ -1376,6 +1380,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
ScheduleDAGMI *DAG;
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
+
// Organize lists of SchedGroups by their SyncID. SchedGroups /
// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
// between then.
@@ -1402,7 +1408,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
void initSchedGroupBarrierPipelineStage(
std::vector<SUnit>::reverse_iterator RIter);
- void initIGLPOpt(SUnit &SU);
+ bool initIGLPOpt(SUnit &SU);
public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
@@ -1418,7 +1424,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
IGLPPhase Phase = IGLPPhase::Initial;
IGroupLPDAGMutation() = default;
- IGroupLPDAGMutation(IGLPPhase Phase) : Phase(Phase) {}
+ IGroupLPDAGMutation(
+ IGLPPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
+ : SavedMutations(SavedMutations), Phase(Phase) {}
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1609,31 +1618,41 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
SyncedSchedGroups.clear();
SyncedInstrs.clear();
- bool foundSB = false;
- bool foundIGLP = false;
+ bool FoundSB = false;
+ bool FoundIGLP = false;
+ bool ShouldApplyIGLP = false;
for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
unsigned Opc = R->getInstr()->getOpcode();
// SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
if (Opc == AMDGPU::SCHED_BARRIER) {
addSchedBarrierEdges(*R);
- foundSB = true;
+ FoundSB = true;
} else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
initSchedGroupBarrierPipelineStage(R);
- foundSB = true;
+ FoundSB = true;
} else if (Opc == AMDGPU::IGLP_OPT) {
resetEdges(*R, DAG);
- if (!foundSB && !foundIGLP)
- initIGLPOpt(*R);
- foundIGLP = true;
+ if (!FoundSB && !FoundIGLP) {
+ FoundIGLP = true;
+ ShouldApplyIGLP = initIGLPOpt(*R);
+ }
}
}
- if (foundSB || foundIGLP) {
+ if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it
// determined as the best
PS.solve();
+ return;
}
+
+ if (!SavedMutations)
+ return;
+
+ // We did not apply a mutation, fall back to SavedMutations
+ for (auto &m : *SavedMutations)
+ m->apply(DAG);
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -1712,14 +1731,15 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}
-void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
+bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
auto S = createIGLPStrategy(StrategyID, DAG, TII);
- if (S->shouldApplyStrategy(DAG)) {
- IsBottomUp = S->IsBottomUp;
- S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
- }
+ if (!S->shouldApplyStrategy(DAG))
+ return false;
+
+ IsBottomUp = S->IsBottomUp;
+ return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
}
} // namespace
@@ -1731,9 +1751,10 @@ namespace llvm {
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(IGLPPhase Phase) {
- return std::make_unique<IGroupLPDAGMutation>(Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+ IGLPPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
+ return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index dec723fbf1fb83..ec48430ba3684b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -11,6 +11,7 @@
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
+#include <vector>
namespace llvm {
@@ -18,7 +19,9 @@ namespace llvm {
// classified into a SchedGroup.
enum class IGLPPhase { Initial = 0u, PreRAReentry = 1u << 0, PostRA = 1u << 1 };
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(IGLPPhase Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+ IGLPPhase Phase,
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 51090891a38891..9ba12171396f51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,7 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -471,7 +471,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr));
return DAG;
}
@@ -934,7 +934,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA));
+ DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA, nullptr));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d8e28c9ec9409c..8ce5a4cb58586d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry));
+ DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry, nullptr));
InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -856,7 +856,8 @@ bool GCNSchedStage::initGCNRegion() {
bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
- IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry));
+ IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry,
+ &SavedMutations));
}
return true;
@@ -1570,7 +1571,8 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA));
+ addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA,
+ &SavedMutations));
}
ScheduleDAGMI::schedule();
>From 7eb410cc67950a84b367a4f12c14a3eec17d3eb9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 9 Feb 2024 16:04:14 -0800
Subject: [PATCH 3/3] [AMDGPU] Introduce iglp_opt(2): Generalized exp/mfma
interleaving for select kernels
Change-Id: Iccb8a7bc6ed44f5d5e968b04b24ccfa7194d884d
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 985 +++++++-
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 9 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 +-
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 2055 +++++++++++++++++
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 901 ++++++++
6 files changed, 3925 insertions(+), 42 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 94b0cce7b5cc94..d0ed682c1b4027 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -140,8 +140,6 @@ class SchedGroup {
// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;
- const SIInstrInfo *TII;
-
// Try to add and edge from SU A to SU B.
bool tryAddEdge(SUnit *A, SUnit *B);
@@ -154,6 +152,7 @@ class SchedGroup {
SmallVector<SUnit *, 32> Collection;
ScheduleDAGInstrs *DAG;
+ const SIInstrInfo *TII;
// Returns true if SU can be added to this SchedGroup.
bool canAddSU(SUnit &SU) const;
@@ -234,13 +233,13 @@ class SchedGroup {
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
+ : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
SGID = NumSchedGroups++;
}
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
+ : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
SGID = NumSchedGroups++;
}
};
@@ -838,6 +837,7 @@ void PipelineSolver::solve() {
enum IGLPStrategyID : int {
MFMASmallGemmOptID = 0,
MFMASmallGemmSingleWaveOptID = 1,
+ MFMAExpInterleave = 2
};
// Implement a IGLP scheduling strategy.
@@ -852,10 +852,11 @@ class IGLPStrategy {
virtual bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- IGLPPhase Phase) = 0;
+ AMDGPU::SchedulingPhase Phase) = 0;
// Returns true if this strategy should be applied to a ScheduleDAG.
- virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
+ virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) = 0;
bool IsBottomUp = 1;
@@ -871,9 +872,12 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- IGLPPhase Phase) override;
+ AMDGPU::SchedulingPhase Phase) override;
- bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) override {
+ return true;
+ }
MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: IGLPStrategy(DAG, TII) {
@@ -884,7 +888,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
bool MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- IGLPPhase Phase) {
+ AMDGPU::SchedulingPhase Phase) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -906,6 +910,918 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
return true;
}
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+ // The count of TRANS SUs involved in the interleaved pipeline
+ static unsigned TransPipeCount;
+ // The count of MFMA SUs involved in the interleaved pipeline
+ static unsigned MFMAPipeCount;
+ // The number of transitive MFMA successors for each TRANS SU
+ static unsigned MFMAEnablement;
+ // The number of transitive TRANS predecessors for each MFMA SU
+ static unsigned ExpRequirement;
+ // The count of independent "chains" of MFMA instructions in the pipeline
+ static unsigned MFMAChains;
+ // The length of each independent "chain" of MFMA instructions
+ static unsigned MFMAChainLength;
+ // Whether or not the pipeline has V_CVT instructions
+ static bool HasCvt;
+ // Whether or not there are instructions between the TRANS instruction and
+ // V_CVT
+ static bool HasChainBetweenCvt;
+ // The first occuring DS_READ which feeds an MFMA chain
+ static std::optional<unsigned> FirstPipeDSR;
+ SmallVector<SUnit *, 4> MFMAChainSeeds;
+ // Compute the heuristics for the pipeline, returning whether or not the DAG
+ // is well formatted for the mutation
+ bool analyzeDAG(const SIInstrInfo *TII);
+
+ /// Whether or not the instruction is a transitive predecessor of an MFMA
+ /// instruction
+ class IsPipeExp final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+ auto DAG = SyncPipe[0].DAG;
+
+ if (Cache->empty()) {
+ auto I = DAG->SUnits.rbegin();
+ auto E = DAG->SUnits.rend();
+ for (; I != E; I++) {
+ if (TII->isMFMAorWMMA(*I->getInstr()))
+ Cache->push_back(&*I);
+ }
+ if (Cache->empty())
+ return false;
+ }
+
+ auto Reaches = (std::any_of(
+ Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+ return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+ }));
+
+ return Reaches;
+ }
+ IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ /// Whether or not the instruction is a transitive predecessor of the
+ /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction
+ class EnablesNthMFMA final : public InstructionRule {
+ private:
+ unsigned Number = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ bool FoundTrans = false;
+ unsigned Counter = 1;
+ auto DAG = SyncPipe[0].DAG;
+
+ if (Cache->empty()) {
+ SmallVector<SUnit *, 8> Worklist;
+
+ auto I = DAG->SUnits.begin();
+ auto E = DAG->SUnits.end();
+ for (; I != E; I++) {
+ if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
+ if (Counter == Number) {
+ Cache->push_back(&*I);
+ break;
+ }
+ ++Counter;
+ }
+ if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
+ FoundTrans = true;
+ }
+ if (Cache->empty())
+ return false;
+ }
+
+ return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+ }
+
+ EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+ };
+
+ /// Whether or not the instruction enables the exact MFMA that is the \p
+ /// Number th MFMA in the chain starting with \p ChainSeed
+ class EnablesNthMFMAInChain final : public InstructionRule {
+ private:
+ unsigned Number = 1;
+ SUnit *ChainSeed;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ auto DAG = SyncPipe[0].DAG;
+
+ if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+ return false;
+
+ if (Cache->empty()) {
+ auto TempSU = ChainSeed;
+ auto Depth = Number;
+ while (Depth > 0) {
+ --Depth;
+ bool Found = false;
+ for (auto &Succ : TempSU->Succs) {
+ if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
+ TempSU = Succ.getSUnit();
+ Found = true;
+ break;
+ }
+ }
+ if (!Found)
+ return false;
+ }
+
+ Cache->push_back(TempSU);
+ }
+ // If we failed to find the instruction to be placed into the cache, we
+ // would have already exited.
+ assert(!Cache->empty());
+
+ return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+ }
+
+ EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
+ const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+ ChainSeed(ChainSeed) {}
+ };
+
+ /// Whether or not the instruction has less than \p Size immediate successors.
+ /// If \p HasIntermediary is true, this tests also whether all successors of
+ /// the SUnit have less than \p Size successors.
+ class LessThanNSuccs final : public InstructionRule {
+ private:
+ unsigned Size = 1;
+ bool HasIntermediary = false;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ if (!SyncPipe.size())
+ return false;
+
+ auto SuccSize = std::count_if(
+ SU->Succs.begin(), SU->Succs.end(),
+ [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
+ if (SuccSize >= Size)
+ return false;
+
+ if (HasIntermediary) {
+ for (auto Succ : SU->Succs) {
+ auto SuccSize = std::count_if(
+ Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
+ [](const SDep &SuccSucc) {
+ return SuccSucc.getKind() == SDep::Data;
+ });
+ if (SuccSize >= Size)
+ return false;
+ }
+ }
+
+ return true;
+ }
+ LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+ bool HasIntermediary = false, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Size(Size),
+ HasIntermediary(HasIntermediary) {}
+ };
+
+ /// Whether or not the instruction has greater than or equal to \p Size
+ /// immediate successors. If \p HasIntermediary is true, this tests also
+ /// whether all successors of the SUnit have greater than or equal to \p Size
+ /// successors.
+ class GreaterThanOrEqualToNSuccs final : public InstructionRule {
+ private:
+ unsigned Size = 1;
+ bool HasIntermediary = false;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ if (!SyncPipe.size())
+ return false;
+
+ auto SuccSize = std::count_if(
+ SU->Succs.begin(), SU->Succs.end(),
+ [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
+ if (SuccSize >= Size)
+ return true;
+
+ if (HasIntermediary) {
+ for (auto Succ : SU->Succs) {
+ auto SuccSize = std::count_if(
+ Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
+ [](const SDep &SuccSucc) {
+ return SuccSucc.getKind() == SDep::Data;
+ });
+ if (SuccSize >= Size)
+ return true;
+ }
+ }
+
+ return false;
+ }
+ GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
+ unsigned SGID, bool HasIntermediary = false,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Size(Size),
+ HasIntermediary(HasIntermediary) {}
+ };
+
+ // Whether or not the instruction is a relevant V_CVT instruction.
+ class IsCvt final : public InstructionRule {
+ private:
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ auto Opc = SU->getInstr()->getOpcode();
+ return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+ Opc == AMDGPU::V_CVT_I32_F32_e32;
+ }
+ IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether or not the instruction is V_FMA_F32.
+ class IsFMA final : public InstructionRule {
+ private:
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
+ SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
+ }
+ IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ /// Whether or not the instruction is an immediate RAW successor
+ /// of the SchedGroup \p Distance steps before.
+ class IsSuccOfPrevNthGroup final : public InstructionRule {
+ private:
+ unsigned Distance = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ if (!SyncPipe.size())
+ return false;
+
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Distance)
+ OtherGroup = &PipeSG;
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ for (auto &OtherEle : OtherGroup->Collection) {
+ for (auto &Succ : OtherEle->Succs) {
+ if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
+ return true;
+ }
+ }
+
+ return false;
+ }
+ IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+ };
+
+ /// Whether or not the instruction is a transitive successor of any
+ /// instruction the the SchedGroup \p Distance steps before.
+ class IsReachableFromPrevNthGroup final : public InstructionRule {
+ private:
+ unsigned Distance = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ if (!SyncPipe.size())
+ return false;
+
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Distance)
+ OtherGroup = &PipeSG;
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ auto DAG = SyncPipe[0].DAG;
+
+ for (auto &OtherEle : OtherGroup->Collection)
+ if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
+ return true;
+
+ return false;
+ }
+ IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+ };
+
+ /// Whether or not the instruction occurs after the SU with NodeNUm \p Number
+ class OccursAtOrAfterNode final : public InstructionRule {
+ private:
+ unsigned Number = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+ return SU->NodeNum >= Number;
+ }
+ OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+ };
+
+ /// Whether or not the SU is exactly the \p Number th MFMA in the chain
+ /// starting with \p ChainSeed
+ class IsExactMFMA final : public InstructionRule {
+ private:
+ unsigned Number = 1;
+ SUnit *ChainSeed;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+ return false;
+
+ if (Cache->empty()) {
+ auto TempSU = ChainSeed;
+ auto Depth = Number;
+ while (Depth > 0) {
+ --Depth;
+ bool Found = false;
+ for (auto &Succ : TempSU->Succs) {
+ if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
+ TempSU = Succ.getSUnit();
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ return false;
+ }
+ }
+ Cache->push_back(TempSU);
+ }
+ // If we failed to find the instruction to be placed into the cache, we
+ // would have already exited.
+ assert(!Cache->empty());
+
+ return (*Cache)[0] == SU;
+ }
+
+ IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+ ChainSeed(ChainSeed) {}
+ };
+
+ // Whether the instruction occurs after the first TRANS instruction. This
+ // implies the instruction can not be a predecessor of the first TRANS
+ // insruction
+ class OccursAfterExp final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+ SmallVector<SUnit *, 12> Worklist;
+ auto DAG = SyncPipe[0].DAG;
+ if (Cache->empty()) {
+ for (auto &SU : DAG->SUnits)
+ if (TII->isTRANS(SU.getInstr()->getOpcode())) {
+ Cache->push_back(&SU);
+ break;
+ }
+ if (Cache->empty())
+ return false;
+ }
+
+ return SU->NodeNum > (*Cache)[0]->NodeNum;
+ }
+
+ OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+public:
+ bool applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) override;
+
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) override;
+
+ MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = 0;
+ }
+};
+
+unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
+unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
+unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
+unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
+unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
+unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
+bool MFMAExpInterleaveOpt::HasCvt = false;
+bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
+std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
+
+bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
+ SmallVector<SUnit *, 10> ExpPipeCands;
+ SmallVector<SUnit *, 10> MFMAPipeCands;
+ SmallVector<SUnit *, 10> MFMAPipeSUs;
+ SmallVector<SUnit *, 10> PackSUs;
+ SmallVector<SUnit *, 10> CvtSUs;
+
+ auto isBitPack = [](unsigned Opc) {
+ return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
+ };
+
+ auto isCvt = [](unsigned Opc) {
+ return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
+ };
+
+ for (SUnit &SU : DAG->SUnits) {
+ auto Opc = SU.getInstr()->getOpcode();
+ if (TII->isTRANS(Opc)) {
+ // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+ if (SU.Succs.size() >= 7)
+ continue;
+ for (auto &Succ : SU.Succs) {
+ if (Succ.getSUnit()->Succs.size() >= 7)
+ continue;
+ }
+ ExpPipeCands.push_back(&SU);
+ }
+
+ if (TII->isMFMAorWMMA(*SU.getInstr()))
+ MFMAPipeCands.push_back(&SU);
+
+ if (isBitPack(Opc))
+ PackSUs.push_back(&SU);
+
+ if (isCvt(Opc))
+ CvtSUs.push_back(&SU);
+ }
+
+ if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
+ return false;
+
+ TransPipeCount = 0;
+
+ std::optional<SUnit *> TempMFMA;
+ std::optional<SUnit *> TempExp;
+ // Count the number of EXPs that reach an MFMA
+ for (auto &PredSU : ExpPipeCands) {
+ for (auto &SuccSU : MFMAPipeCands) {
+ if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (!TempExp) {
+ TempExp = PredSU;
+ TempMFMA = SuccSU;
+ }
+ MFMAPipeSUs.push_back(SuccSU);
+ ++TransPipeCount;
+ break;
+ }
+ }
+ }
+
+ if (!(TempExp && TempMFMA))
+ return false;
+
+ HasChainBetweenCvt =
+ std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
+ [&isCvt](SDep &Succ) {
+ return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
+ }) == (*TempExp)->Succs.end();
+
+ // Count the number of MFMAs that are reached by an EXP
+ for (auto &SuccSU : MFMAPipeCands) {
+ if (MFMAPipeSUs.size() &&
+ std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
+ [&SuccSU](SUnit *PotentialMatch) {
+ return PotentialMatch->NodeNum == SuccSU->NodeNum;
+ }) != MFMAPipeSUs.end())
+ continue;
+
+ for (auto &PredSU : ExpPipeCands) {
+ if (DAG->IsReachable(SuccSU, PredSU)) {
+ MFMAPipeSUs.push_back(SuccSU);
+ break;
+ }
+ }
+ }
+
+ MFMAPipeCount = MFMAPipeSUs.size();
+
+ assert(TempExp && TempMFMA);
+
+ std::optional<SUnit *> TempCvt;
+ for (auto &SuccSU : CvtSUs) {
+ if (DAG->IsReachable(SuccSU, *TempExp)) {
+ TempCvt = SuccSU;
+ break;
+ }
+ }
+
+ HasCvt = false;
+ if (TempCvt.has_value()) {
+ for (auto &SuccSU : MFMAPipeSUs) {
+ if (DAG->IsReachable(SuccSU, *TempCvt)) {
+ HasCvt = true;
+ break;
+ }
+ }
+ }
+
+ MFMAChains = 0;
+ for (auto &MFMAPipeSU : MFMAPipeSUs) {
+ if (MFMAChainSeeds.size() &&
+ std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
+ MFMAChainSeeds.end())
+ continue;
+ if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
+ [&TII](SDep &Succ) {
+ return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+ })) {
+ MFMAChainSeeds.push_back(MFMAPipeSU);
+ ++MFMAChains;
+ }
+ }
+
+ if (!MFMAChains)
+ return false;
+
+ for (auto Pred : MFMAChainSeeds[0]->Preds) {
+ if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
+ Pred.getSUnit()->getInstr()->mayLoad())
+ FirstPipeDSR = Pred.getSUnit()->NodeNum;
+ }
+
+ MFMAChainLength = MFMAPipeCount / MFMAChains;
+
+ // The number of bit pack operations that depend on a single V_EXP
+ unsigned PackSuccCount = std::count_if(
+ PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
+ return DAG->IsReachable(VPack, *TempExp);
+ });
+
+ // The number of bit pack operations an MFMA depends on
+ unsigned PackPredCount =
+ std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+ [&isBitPack](SDep &Pred) {
+ auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+ return isBitPack(Opc);
+ });
+
+ auto PackPred =
+ std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+ [&isBitPack](SDep &Pred) {
+ auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+ return isBitPack(Opc);
+ });
+
+ if (PackPred == (*TempMFMA)->Preds.end())
+ return false;
+
+ MFMAEnablement = 0;
+ ExpRequirement = 0;
+ // How many MFMAs depend on a single bit pack operation
+ MFMAEnablement =
+ std::count_if(PackPred->getSUnit()->Succs.begin(),
+ PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
+ return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+ });
+
+ // The number of MFMAs that depend on a single V_EXP
+ MFMAEnablement *= PackSuccCount;
+
+ // The number of V_EXPs required to resolve all dependencies for an MFMA
+ ExpRequirement =
+ std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
+ [this, &PackPred](SUnit *ExpBase) {
+ return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
+ });
+
+ ExpRequirement *= PackPredCount;
+ return true;
+}
+
+bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) {
+ const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ if (Phase != AMDGPU::SchedulingPhase::PostRA)
+ MFMAChainSeeds.clear();
+ if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
+ return false;
+
+ return true;
+}
+
+bool MFMAExpInterleaveOpt::applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) {
+
+ bool IsSmallKernelType =
+ MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
+ bool IsLargeKernelType =
+ MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
+
+ if (!(IsSmallKernelType || IsLargeKernelType))
+ return false;
+
+ const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned PipelineSyncID = 0;
+ SchedGroup *SG = nullptr;
+
+ unsigned MFMAChain = 0;
+ unsigned PositionInChain = 0;
+ unsigned CurrMFMAForTransPosition = 0;
+
+ auto incrementTransPosition = [&MFMAChain, &PositionInChain,
+ &CurrMFMAForTransPosition]() {
+ CurrMFMAForTransPosition += MFMAEnablement;
+ PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
+ MFMAChain = CurrMFMAForTransPosition % MFMAChains;
+ };
+
+ auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
+ auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
+ return (TempMFMAForTrans / MFMAChains);
+ };
+
+ auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
+ auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
+ return TempMFMAForTrans % MFMAChains;
+ };
+
+ unsigned CurrMFMAPosition = 0;
+ unsigned MFMAChainForMFMA = 0;
+ unsigned PositionInChainForMFMA = 0;
+
+ auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
+ &PositionInChainForMFMA]() {
+ ++CurrMFMAPosition;
+ MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
+ PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
+ };
+
+ bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
+ assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
+
+ bool UsesFMA = IsSmallKernelType || !IsPostRA;
+ bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
+ bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
+
+ // PHASE 1: "Prefetch"
+ if (UsesFMA) {
+ // First Round FMA
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains) {
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+ true));
+ } else
+ SG->addRule(
+ std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ // Second Round FMA
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains) {
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ getNextTransPositionInChain(),
+ MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
+ } else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
+ SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ if (UsesDSRead) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
+ SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // First Round EXP
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains)
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
+ else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+ HasChainBetweenCvt));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ incrementTransPosition();
+
+ // First Round CVT, Third Round FMA, Second Round EXP; interleaved
+ for (unsigned I = 0; I < ExpRequirement; I++) {
+ // First Round CVT
+ if (UsesCvt) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
+ if (HasChainBetweenCvt)
+ SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+ 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
+ else
+ SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
+ 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Third Round FMA
+ if (UsesFMA) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains) {
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ getNextTransPositionInChain(),
+ MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
+ } else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
+ TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Second Round EXP
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains)
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+ true));
+ else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
+ SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+ HasChainBetweenCvt));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // The "extra" EXP which enables all MFMA
+ // TODO: UsesExtraExp
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
+ 8, TII, SG->getSGID(), HasChainBetweenCvt));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ // PHASE 2: Main Interleave Loop
+
+ // The number of MFMAs per iteration
+ auto MFMARatio =
+ MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
+ // The number of Exps per iteration
+ auto ExpRatio =
+ MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
+ // The reamaining Exps
+ auto RemainingExp = TransPipeCount - (2 * ExpRequirement);
+ auto ExpLoopCount = RemainingExp / ExpRatio;
+ // In loop MFMAs
+ auto MFMAInLoop = MFMAPipeCount - (MFMAEnablement * 2);
+ auto MFMALoopCount = MFMAInLoop / MFMARatio;
+ auto LoopSize = std::min(ExpLoopCount, MFMALoopCount);
+
+ for (unsigned I = 0; I < LoopSize; I++) {
+ if (!(I * ExpRatio % ExpRequirement))
+ incrementTransPosition();
+
+ // Round N MFMA
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains)
+ SG->addRule(std::make_shared<IsExactMFMA>(
+ PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
+ SG->getSGID(), true));
+ else
+ SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ incrementMFMAPosition();
+
+ if (UsesDSRead && !(I % 4)) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
+ SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // CVT, EXP, FMA Interleaving
+ for (unsigned J = 0; J < ExpRatio; J++) {
+ auto MFMAOffset = MFMARatio * (I + 1);
+ auto MaxMFMAOffset = ExpRequirement * MFMARatio / ExpRatio;
+
+ // Round N + 1 CVT
+ if (UsesCvt) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
+ auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
+ auto DSROffset = I / 4 + 1;
+ auto MaxDSROffset = MaxMFMAOffset / 4;
+ // TODO: UsesExtraExp
+ auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
+ auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
+ std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
+ ExpOffset;
+ if (HasChainBetweenCvt)
+ SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
+ CurrentOffset, TII, SG->getSGID()));
+ else
+ SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
+ SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Round N + 3 FMA
+ if (UsesFMA) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains)
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ getNextTransPositionInChain(),
+ MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
+ true));
+ else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(
+ (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
+ TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Round N + 2 Exp
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+ if (!IsPostRA && MFMAChains)
+ SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
+ PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
+ true));
+ else
+ SG->addRule(std::make_shared<EnablesNthMFMA>(
+ (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
+ TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
+ HasChainBetweenCvt));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+ }
+
+ // PHASE 3: Remaining MFMAs
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ return true;
+}
+
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
// Whether the DS_READ is a predecessor of first four MFMA in region
@@ -1103,9 +2019,12 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
bool applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- IGLPPhase Phase) override;
+ AMDGPU::SchedulingPhase Phase) override;
- bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) override {
+ return true;
+ }
MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: IGLPStrategy(DAG, TII) {
@@ -1120,14 +2039,15 @@ static unsigned DSWWithSharedVMEMCount = 0;
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- IGLPPhase Phase) {
+ AMDGPU::SchedulingPhase Phase) {
unsigned MFMACount = 0;
unsigned DSRCount = 0;
- assert(
- (Phase != IGLPPhase::Initial || (DSWCount == 0 && DSWWithPermCount == 0 &&
- DSWWithSharedVMEMCount == 0)) &&
- "DSWCounters should be zero in pre-RA scheduling!");
+ bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
+
+ assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
+ DSWWithSharedVMEMCount == 0)) &&
+ "DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto I = SU.getInstr();
@@ -1136,7 +2056,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
- else if (I->mayStore() && Phase == IGLPPhase::Initial) {
+ else if (I->mayStore() && IsInitial) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1149,7 +2069,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}
- if (Phase == IGLPPhase::Initial) {
+ if (IsInitial) {
DSWWithPermCount = DSWithPerms.size();
auto I = DSWithPerms.begin();
auto E = DSWithPerms.end();
@@ -1257,14 +2177,14 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
- SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
1, TII, SG->getSGID(), true));
- SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1275,7 +2195,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
3, TII, SG->getSGID(), true));
- SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1293,7 +2213,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
- SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1314,7 +2234,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
- SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1328,7 +2248,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
- SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1339,7 +2259,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2, TII, SG->getSGID(), true));
- SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1350,7 +2270,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
4, TII, SG->getSGID(), true));
- SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
@@ -1369,6 +2289,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
case MFMASmallGemmSingleWaveOptID:
return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
+ case MFMAExpInterleave:
+ return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
}
llvm_unreachable("Unknown IGLPStrategyID");
@@ -1420,12 +2342,12 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
// first created SchedGroup first.
bool IsBottomUp = 1;
- // Whether or not this is a reentry into the IGroupLPDAGMutation.
- IGLPPhase Phase = IGLPPhase::Initial;
+ // The scheduling phase this application of IGLP corresponds with.
+ AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
IGroupLPDAGMutation() = default;
IGroupLPDAGMutation(
- IGLPPhase Phase,
+ AMDGPU::SchedulingPhase Phase,
std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
: SavedMutations(SavedMutations), Phase(Phase) {}
};
@@ -1601,7 +2523,6 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
auto &SU = *I;
if (isFull())
break;
-
if (canAddSU(SU))
SyncedInstrs[&SU].push_back(SGID);
}
@@ -1735,7 +2656,7 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
auto S = createIGLPStrategy(StrategyID, DAG, TII);
- if (!S->shouldApplyStrategy(DAG))
+ if (!S->shouldApplyStrategy(DAG, Phase))
return false;
IsBottomUp = S->IsBottomUp;
@@ -1752,7 +2673,7 @@ namespace llvm {
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
- IGLPPhase Phase,
+ AMDGPU::SchedulingPhase Phase,
std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index ec48430ba3684b..46ef4d702d0022 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -15,12 +15,13 @@
namespace llvm {
-// Components of the mask that determines which instruction types may be may be
-// classified into a SchedGroup.
-enum class IGLPPhase { Initial = 0u, PreRAReentry = 1u << 0, PostRA = 1u << 1 };
+namespace AMDGPU {
+// The current phase of instruction scheduling
+enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
+} // namespace AMDGPU
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
- IGLPPhase Phase,
+ AMDGPU::SchedulingPhase Phase,
std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9ba12171396f51..a0c6e3a1b49b75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,7 +461,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr));
+ DAG->addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -471,7 +472,8 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::Initial, nullptr));
+ DAG->addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
return DAG;
}
@@ -934,7 +936,8 @@ class GCNPassConfig final : public AMDGPUPassConfig {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
- DAG->addMutation(createIGroupLPDAGMutation(IGLPPhase::PostRA, nullptr));
+ DAG->addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 8ce5a4cb58586d..3f3550d3029aad 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,7 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation(IGLPPhase::PreRAReentry, nullptr));
+ DAG.addMutation(createIGroupLPDAGMutation(
+ AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -856,7 +857,8 @@ bool GCNSchedStage::initGCNRegion() {
bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
- IsInitialStage ? IGLPPhase::Initial : IGLPPhase::PreRAReentry,
+ IsInitialStage ? AMDGPU::SchedulingPhase::Initial
+ : AMDGPU::SchedulingPhase::PreRAReentry,
&SavedMutations));
}
@@ -1571,7 +1573,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation(/*IsReentry=*/IGLPPhase::PostRA,
+ addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
&SavedMutations));
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
new file mode 100644
index 00000000000000..da1d9972e42dcf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -0,0 +1,2055 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+ define amdgpu_kernel void @largeInterleave() #0 { ret void }
+ ; GCN-LABEL: largeInterleave:
+ ; GCN: ; %bb.0:
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr2
+ ; GCN-NEXT: ; implicit-def: $vgpr1
+ ; GCN-NEXT: ; implicit-def: $vgpr8
+ ; GCN-NEXT: ; implicit-def: $vgpr94
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: ; implicit-def: $vgpr106
+ ; GCN-NEXT: ; implicit-def: $vgpr132
+ ; GCN-NEXT: ; implicit-def: $vgpr133
+ ; GCN-NEXT: ; implicit-def: $vgpr139
+ ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2
+ ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
+ ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1
+ ; GCN-NEXT: v_add_u32_e32 v93, s0, v92
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s0, s7, 7
+ ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1
+ ; GCN-NEXT: v_add_u32_e32 v8, 64, v93
+ ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: ; kill: killed $vgpr92
+ ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
+ ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ; kill: killed $vgpr72
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93
+ ; GCN-NEXT: ; implicit-def: $vgpr73
+ ; GCN-NEXT: v_add_u32_e32 v76, v132, v64
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; kill: killed $vgpr72
+ ; GCN-NEXT: v_add_u32_e32 v72, v132, v73
+ ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr74
+ ; GCN-NEXT: v_add_u32_e32 v72, v132, v74
+ ; GCN-NEXT: ; implicit-def: $vgpr75
+ ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v72, v132, v75
+ ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; kill: killed $vgpr76
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: ; implicit-def: $sgpr8
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[64:67], v94
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
+ ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[94:97], v106
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
+ ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
+ ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5
+ ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8
+ ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5
+ ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
+ ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8
+ ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8
+ ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5
+ ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
+ ; GCN-NEXT: s_nop 5
+ ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48
+ ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49
+ ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101
+ ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50
+ ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51
+ ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
+ ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53
+ ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55
+ ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56
+ ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
+ ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
+ ; GCN-NEXT: s_nop 6
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39
+ ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40
+ ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80
+ ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43
+ ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
+ ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45
+ ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
+ ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46
+ ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47
+ ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
+ ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
+ ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
+ ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
+ ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20
+ ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21
+ ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22
+ ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23
+ ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24
+ ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25
+ ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26
+ ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27
+ ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14
+ ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
+ ; GCN-NEXT: ; implicit-def: $vgpr65
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: ; implicit-def: $vgpr68
+ ; GCN-NEXT: ; implicit-def: $vgpr67
+ ; GCN-NEXT: v_add_u32_e32 v65, s7, v65
+ ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65
+ ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6
+ ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1
+ ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
+ ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v135, v[94:95]
+ ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
+ ; GCN-NEXT: v_max_f32_e32 v64, v64, v65
+ ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v136, v[98:99]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v137, v[102:103]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v138, v[96:97]
+ ; GCN-NEXT: v_add_u32_e32 v68, v132, v68
+ ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7]
+ ; GCN-NEXT: v_max_f32_e32 v64, v64, v64
+ ; GCN-NEXT: ; implicit-def: $vgpr65
+ ; GCN-NEXT: v_max_f32_e32 v66, v65, v65
+ ; GCN-NEXT: v_max_f32_e32 v134, v66, v64
+ ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v64, v132, v64
+ ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: v_add_u32_e32 v64, v132, v66
+ ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v64, v132, v67
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
+ ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
+ ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+ ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v163, v57
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+ ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v164, v57
+ ; GCN-NEXT: v_exp_f32_e32 v49, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v50, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
+ ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v51, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v52, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v53, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134
+ ; GCN-NEXT: ds_read_b128 v[140:143], v139
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v54, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
+ ; GCN-NEXT: v_exp_f32_e32 v55, v48
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
+ ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v56, v48
+ ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
+ ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+ ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v48, v48
+ ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
+ ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
+ ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
+ ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
+ ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+ ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
+ ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
+ ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+ ; GCN-NEXT: v_exp_f32_e32 v58, v58
+ ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
+ ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
+ ; GCN-NEXT: v_exp_f32_e32 v59, v57
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
+ ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
+ ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
+ ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
+ ; GCN-NEXT: ; implicit-def: $vgpr57
+ ; GCN-NEXT: ds_read_b128 v[60:63], v57
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v160, v149
+ ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+ ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
+ ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
+ ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
+ ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
+ ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
+ ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
+ ; GCN-NEXT: v_exp_f32_e32 v162, v146
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
+ ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
+ ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
+ ; GCN-NEXT: v_exp_f32_e32 v151, v33
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
+ ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
+ ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
+ ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
+ ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
+ ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v153, v33
+ ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
+ ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161
+ ; GCN-NEXT: v_exp_f32_e32 v165, v60
+ ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8
+ ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v161, v61
+ ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8
+ ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5
+ ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8
+ ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v135, v[36:37]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
+ ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5
+ ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151
+ ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153
+ ; GCN-NEXT: v_exp_f32_e32 v159, v33
+ ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38
+ ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152
+ ; GCN-NEXT: v_exp_f32_e32 v152, v38
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v136, v[60:61]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v137, v[32:33]
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: ; implicit-def: $vgpr38
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v138, v[140:141]
+ ; GCN-NEXT: v_add_u32_e32 v38, v132, v38
+ ; GCN-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr36
+ ; GCN-NEXT: v_add_u32_e32 v33, v132, v36
+ ; GCN-NEXT: ; implicit-def: $vgpr37
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v33, v132, v37
+ ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165
+ ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156
+ ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134
+ ; GCN-NEXT: ds_read_b128 v[36:39], v139
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
+ ; GCN-NEXT: v_exp_f32_e32 v154, v32
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+ ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134
+ ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v155, v32
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161
+ ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159
+ ; GCN-NEXT: v_exp_f32_e32 v157, v32
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+ ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142
+ ; GCN-NEXT: v_exp_f32_e32 v146, v32
+ ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134
+ ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
+ ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v147, v36
+ ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v143, v36
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155
+ ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
+ ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157
+ ; GCN-NEXT: v_exp_f32_e32 v156, v32
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146
+ ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32
+ ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
+ ; GCN-NEXT: v_exp_f32_e32 v129, v36
+ ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147
+ ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[36:39], v57
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v142, v40
+ ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143
+ ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
+ ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+ ; GCN-NEXT: v_exp_f32_e32 v63, v40
+ ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61
+ ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134
+ ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156
+ ; GCN-NEXT: v_exp_f32_e32 v158, v17
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129
+ ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
+ ; GCN-NEXT: v_exp_f32_e32 v128, v17
+ ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8
+ ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
+ ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_exp_f32_e32 v167, v36
+ ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8
+ ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v130, v37
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
+ ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
+ ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v135, v[20:21]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
+ ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
+ ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+ ; GCN-NEXT: v_exp_f32_e32 v140, v17
+ ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v136, v[36:37]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
+ ; GCN-NEXT: v_exp_f32_e32 v144, v22
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v137, v[16:17]
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v138, v[42:43]
+ ; GCN-NEXT: v_add_u32_e32 v22, v132, v22
+ ; GCN-NEXT: v_add_u32_e32 v17, v132, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v20, v132, v20
+ ; GCN-NEXT: v_add_u32_e32 v21, v132, v21
+ ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
+ ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
+ ; GCN-NEXT: v_exp_f32_e32 v132, v16
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167
+ ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134
+ ; GCN-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v62, v16
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130
+ ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134
+ ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134
+ ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140
+ ; GCN-NEXT: v_exp_f32_e32 v145, v16
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141
+ ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46
+ ; GCN-NEXT: v_exp_f32_e32 v35, v16
+ ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
+ ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v46, v20
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v47, v20
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34
+ ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145
+ ; GCN-NEXT: v_exp_f32_e32 v141, v16
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35
+ ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16
+ ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
+ ; GCN-NEXT: v_exp_f32_e32 v33, v20
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
+ ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[20:23], v57
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v36, v24
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47
+ ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
+ ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_exp_f32_e32 v39, v24
+ ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37
+ ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141
+ ; GCN-NEXT: v_exp_f32_e32 v148, v1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
+ ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
+ ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v34, v1
+ ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8
+ ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
+ ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38
+ ; GCN-NEXT: v_exp_f32_e32 v150, v20
+ ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v38, v21
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39
+ ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134
+ ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5
+ ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v135, v[4:5]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
+ ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5
+ ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_exp_f32_e32 v42, v1
+ ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v137, v[0:1]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v138, v[26:27]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
+ ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
+ ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v25, v6
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[4:7], v139
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
+ ; GCN-NEXT: v_exp_f32_e32 v26, v0
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38
+ ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v29, v0
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41
+ ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42
+ ; GCN-NEXT: v_exp_f32_e32 v31, v0
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
+ ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+ ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
+ ; GCN-NEXT: v_exp_f32_e32 v19, v0
+ ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
+ ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v24, v4
+ ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
+ ; GCN-NEXT: v_exp_f32_e32 v27, v4
+ ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
+ ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
+ ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
+ ; GCN-NEXT: v_exp_f32_e32 v30, v0
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
+ ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
+ ; GCN-NEXT: v_exp_f32_e32 v16, v4
+ ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
+ ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
+ ; GCN-NEXT: v_exp_f32_e32 v18, v9
+ ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_exp_f32_e32 v21, v9
+ ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[4:7], v57
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+ ; GCN-NEXT: v_exp_f32_e32 v2, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8
+ ; GCN-NEXT: v_exp_f32_e32 v10, v1
+ ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20
+ ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0
+ ; GCN-NEXT: v_add_f32_e32 v3, 0, v49
+ ; GCN-NEXT: v_add_f32_e32 v3, v50, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v51, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v52, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v53, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v54, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v55, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v56, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v58, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v163, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v164, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v59, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v160, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v162, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v151, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v153, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v165, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v161, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v159, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v152, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v154, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v155, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v157, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v146, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v147, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v143, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v156, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v129, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v142, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v63, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v158, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v128, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v167, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v130, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v140, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v144, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v132, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v62, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v145, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v35, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v46, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v47, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v141, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v33, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v36, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v39, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v148, v3
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
+ ; GCN-NEXT: v_add_f32_e32 v3, v34, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v150, v3
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
+ ; GCN-NEXT: v_add_f32_e32 v3, v38, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v42, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v25, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v26, v3
+ ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
+ ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
+ ; GCN-NEXT: v_add_f32_e32 v3, v29, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v31, v3
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
+ ; GCN-NEXT: v_add_f32_e32 v3, v19, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v24, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v27, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v30, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v16, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v18, v3
+ ; GCN-NEXT: v_add_f32_e32 v3, v21, v3
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+ ; GCN-NEXT: v_add_f32_e32 v0, v2, v3
+ ; GCN-NEXT: v_add_f32_e32 v4, v10, v0
+ ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_f32_e32 v2, v4, v5
+ ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+ ; GCN-NEXT: ; implicit-def: $vgpr4
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48
+ ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: s_endpgm
+ attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+ !0 = !{i64 2862105}
+
+...
+
+---
+name: largeInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 7
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+ %11:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_512 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %443:sgpr_128 = IMPLICIT_DEF
+ %18:sreg_32 = IMPLICIT_DEF
+ %25:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ %391:vreg_128_align2 = IMPLICIT_DEF
+ %24:vgpr_32 = IMPLICIT_DEF
+ %392:vreg_128_align2 = IMPLICIT_DEF
+ %401:vreg_128_align2 = IMPLICIT_DEF
+ %406:vreg_128_align2 = IMPLICIT_DEF
+ %48:vgpr_32 = IMPLICIT_DEF
+ %473:sgpr_128 = IMPLICIT_DEF
+ %411:vreg_128_align2 = IMPLICIT_DEF
+ %416:vreg_128_align2 = IMPLICIT_DEF
+ %421:vreg_128_align2 = IMPLICIT_DEF
+ %426:vreg_128_align2 = IMPLICIT_DEF
+ %1114:sgpr_32 = IMPLICIT_DEF
+ %39:vgpr_32 = IMPLICIT_DEF
+ %484:sreg_64_xexec = IMPLICIT_DEF
+ %3346:vgpr_32 = IMPLICIT_DEF
+ %1422:sreg_32 = IMPLICIT_DEF
+ %1424:sreg_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %494:sreg_32 = IMPLICIT_DEF
+ %47:vgpr_32 = IMPLICIT_DEF
+ %41:vgpr_32 = IMPLICIT_DEF
+ %42:vgpr_32 = IMPLICIT_DEF
+ %43:vgpr_32 = IMPLICIT_DEF
+ %44:vgpr_32 = IMPLICIT_DEF
+ %45:vgpr_32 = IMPLICIT_DEF
+ %50:sreg_32 = IMPLICIT_DEF
+ %3347:vgpr_32 = IMPLICIT_DEF
+ %3329:vgpr_32 = IMPLICIT_DEF
+ %3330:vgpr_32 = IMPLICIT_DEF
+ %3331:vgpr_32 = IMPLICIT_DEF
+ %3332:vgpr_32 = IMPLICIT_DEF
+ %3333:vgpr_32 = IMPLICIT_DEF
+ %2986:vreg_512_align2 = IMPLICIT_DEF
+ %3038:vreg_512_align2 = IMPLICIT_DEF
+ %2980:vreg_512_align2 = IMPLICIT_DEF
+ %3003:vreg_512_align2 = IMPLICIT_DEF
+ %3334:vgpr_32 = IMPLICIT_DEF
+ %3335:vgpr_32 = IMPLICIT_DEF
+ %3336:vgpr_32 = IMPLICIT_DEF
+ %3337:vgpr_32 = IMPLICIT_DEF
+ %3338:vgpr_32 = IMPLICIT_DEF
+ %3339:vgpr_32 = IMPLICIT_DEF
+ %3345:vgpr_32 = IMPLICIT_DEF
+ %3340:vgpr_32 = IMPLICIT_DEF
+ %3341:vgpr_32 = IMPLICIT_DEF
+ %3342:vgpr_32 = IMPLICIT_DEF
+ %3343:vgpr_32 = IMPLICIT_DEF
+ %3344:vgpr_32 = IMPLICIT_DEF
+ %84:vgpr_32 = COPY %3347
+ %86:vgpr_32 = COPY %3347:vgpr_32
+ IGLP_OPT 2
+ %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec
+ %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec
+ %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec
+ %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec
+ %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec
+ %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc
+ %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec
+ %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec
+ %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec
+ %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+ early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+ early-clobber %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+ early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+ early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec
+ %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec
+ %708:vgpr_32 = V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec
+ %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec
+ %798:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec
+ %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec
+ %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec
+ %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec
+ %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec
+ %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec
+ %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %821:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec
+ DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %1057:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %1069:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %1088:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, %1061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1133:vgpr_32 = V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec
+ %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec
+ %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, implicit $mode, implicit $exec
+ %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec
+ %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec
+ %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec
+ %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec
+ %151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec
+ %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec
+ %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec
+ %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec
+ %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec
+ %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec
+ %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec
+ %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec
+ %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec
+ %1168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec
+ %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec
+ %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec
+ %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec
+ %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec
+ %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec
+ %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec
+ %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec
+ %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec
+ %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec
+ %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit $mode, implicit $exec
+ %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec
+ %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec
+ %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, implicit $mode, implicit $exec
+ %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec
+ %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec
+ %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec
+ %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec
+ %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec
+ %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec
+ %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec
+ %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec
+ %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec
+ %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec
+ %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec
+ %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec
+ %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec
+ %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec
+ %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec
+ %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec
+ %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec
+ %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec
+ %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec
+ %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, implicit $mode, implicit $exec
+ %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec
+ %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec
+ %1222:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec
+ %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec
+ %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec
+ %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec
+ %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec
+ %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec
+ %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec
+ %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec
+ %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1235:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec
+ %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec
+ %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit $exec
+ %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec
+ %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec
+ %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec
+ %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec
+ %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec
+ %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec
+ %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec
+ %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec
+ %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec
+ %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec
+ %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec
+ %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec
+ %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec
+ %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec
+ %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec
+ %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec
+ %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec
+ %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec
+ %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec
+ %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec
+ %1269:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec
+ %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec
+ %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, implicit $exec
+ %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec
+ %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec
+ %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec
+ %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec
+ %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec
+ %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec
+ %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec
+ %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec
+ %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec
+ %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec
+ %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1288:vgpr_32, implicit $mode, implicit $exec
+ %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec
+ %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec
+ %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec
+ %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec
+ %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec
+ %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec
+ %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec
+ %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec
+ %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec
+ %1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec
+ %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec
+ %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec
+ %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec
+ %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec
+ %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec
+ %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec
+ %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec
+ %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec
+ %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec
+ %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1318:vgpr_32, implicit $mode, implicit $exec
+ %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec
+ %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec
+ %1323:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec
+ %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec
+ %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec
+ %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec
+ %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec
+ %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec
+ %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec
+ %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec
+ %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec
+ %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec
+ %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec
+ %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec
+ %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec
+ %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec
+ %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec
+ %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec
+ %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec
+ %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec
+ %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec
+ %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec
+ %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec
+ %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec
+ %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, implicit $exec
+ %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec
+ %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec
+ %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec
+ %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, %1356:vgpr_32, implicit $mode, implicit $exec
+ %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec
+ %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec
+ %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec
+ %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec
+ %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec
+ %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec
+ %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec
+ %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec
+ %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec
+ %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec
+ %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec
+ %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec
+ %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec
+ %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec
+ %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec
+ %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec
+ %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec
+ %1375:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec
+ %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec
+ %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec
+ %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec
+ %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec
+ %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec
+ %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec
+ %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec
+ %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec
+ %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec
+ %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec
+ %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec
+ %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec
+ %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec
+ %1389:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec
+ %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec
+ %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec
+ %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, implicit $exec
+ %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec
+ %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec
+ %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec
+ %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec
+ %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec
+ %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec
+ %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec
+ %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec
+ %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec
+ %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec
+ %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec
+ %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec
+ %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec
+ %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit $mode, implicit $exec
+ %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec
+ %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec
+ %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec
+ %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec
+ %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec
+ %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec
+ %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec
+ %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec
+ %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec
+ %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec
+ %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec
+ undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec
+ %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec
+ %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec
+ %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec
+ DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec
+ %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec
+ %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec
+ %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec
+ %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec
+ %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec
+ %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec
+ %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec
+ %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec
+ %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec
+ %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec
+ %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec
+ %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec
+ %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1188:vgpr_32, implicit $mode, implicit $exec
+ %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec
+ %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec
+ %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec
+ %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec
+ %1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec
+ %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec
+ %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec
+ %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec
+ %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec
+ %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec
+ %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec
+ %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec
+ %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec
+ %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec
+ %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec
+ %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec
+ %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec
+ %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec
+ %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec
+ %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec
+ %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec
+ %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec
+ %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec
+ %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec
+ %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec
+ %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1293:vgpr_32, implicit $mode, implicit $exec
+ %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec
+ %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec
+ %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec
+ %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec
+ %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec
+ %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+ %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+ %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+ %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+ %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+ %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+ %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+ %1647:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec
+ %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec
+ %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec
+ %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec
+ %1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec
+ %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+ %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+ %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+ %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+ %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+ %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+ %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+ %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3029.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec
+ %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec
+ %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec
+ %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec
+ %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec
+ %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+ %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+ %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+ %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+ %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+ %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+ %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+ %1823:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
+ undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
+ %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
+ DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec
+ %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec
+ %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec
+ %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec
+ %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec
+ %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec
+ %1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec
+ %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec
+ %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec
+ %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec
+ %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1275:vgpr_32, implicit $mode, implicit $exec
+ %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec
+ %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec
+ undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec
+ %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec
+ %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec
+ %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec
+ %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec
+ %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec
+ %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec
+ %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec
+ %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec
+ %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec
+ %2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec
+ %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec
+ %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec
+ %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec
+ %2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec
+ %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec
+ %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec
+ %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec
+ %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec
+ %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec
+ %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, implicit $exec
+ %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec
+ %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec
+ %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec
+ %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec
+ %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3335:vgpr_32, implicit $exec
+ %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec
+ %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec
+ %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec
+ %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec
+ %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec
+ %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
new file mode 100644
index 00000000000000..9c927a50b1cd01
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -0,0 +1,901 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+ define amdgpu_kernel void @smallInterleave() #0 { ret void }
+ ; GCN-LABEL: smallInterleave:
+ ; GCN: ; %bb.0:
+ ; GCN-NEXT: ; implicit-def: $vgpr2
+ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN-NEXT: v_readfirstlane_b32 s20, v2
+ ; GCN-NEXT: ; implicit-def: $sgpr4
+ ; GCN-NEXT: ; implicit-def: $vgpr3
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: ; implicit-def: $vgpr50
+ ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr49
+ ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+ ; GCN-NEXT: ; implicit-def: $vgpr51
+ ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+ ; GCN-NEXT: ; implicit-def: $vgpr76
+ ; GCN-NEXT: ; implicit-def: $vgpr77
+ ; GCN-NEXT: ; implicit-def: $vgpr78
+ ; GCN-NEXT: ; implicit-def: $vgpr79
+ ; GCN-NEXT: ; implicit-def: $vgpr80
+ ; GCN-NEXT: ; implicit-def: $vgpr92
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3
+ ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s4, s20, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr5
+ ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1
+ ; GCN-NEXT: v_add_u32_e32 v76, s20, v76
+ ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v48, v[0:3]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr1
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: v_add_u32_e32 v0, v0, v50
+ ; GCN-NEXT: v_add_u32_e32 v1, v1, v50
+ ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ds_read_b128 v[36:39], v49
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
+ ; GCN-NEXT: ; kill: killed $vgpr1
+ ; GCN-NEXT: ; kill: killed $vgpr0
+ ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6
+ ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1
+ ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77
+ ; GCN-NEXT: ; implicit-def: $sgpr2
+ ; GCN-NEXT: ; implicit-def: $sgpr3
+ ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[36:39], v51
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+ ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b128 v48, v[32:35]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[32:35], v49
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+ ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+ ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[68:71], v51
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+ ; GCN-NEXT: ; implicit-def: $vgpr32
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: v_add_u32_e32 v82, v32, v50
+ ; GCN-NEXT: v_add_u32_e32 v83, v33, v50
+ ; GCN-NEXT: ; kill: killed $vgpr82
+ ; GCN-NEXT: ; kill: killed $vgpr83
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
+ ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr66
+ ; GCN-NEXT: ; implicit-def: $vgpr67
+ ; GCN-NEXT: v_max_f32_e32 v81, v67, v67
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
+ ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2
+ ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3
+ ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b32 v76, v70
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v77, v71
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v78, v72
+ ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+ ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
+ ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65
+ ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21
+ ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23
+ ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24
+ ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25
+ ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27
+ ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28
+ ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29
+ ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
+ ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31
+ ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1
+ ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84
+ ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3
+ ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5
+ ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65
+ ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6
+ ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7
+ ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9
+ ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10
+ ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11
+ ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85
+ ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13
+ ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14
+ ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15
+ ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68
+ ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74
+ ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64
+ ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v79, v68
+ ; GCN-NEXT: ; implicit-def: $vgpr84
+ ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
+ ; GCN-NEXT: v_max_f32_e32 v70, v64, v65
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+ ; GCN-NEXT: v_max_f32_e32 v70, v70, v70
+ ; GCN-NEXT: v_max_f32_e32 v72, v81, v70
+ ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72
+ ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72
+ ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+ ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+ ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+ ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72
+ ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72
+ ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72
+ ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72
+ ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v73, v16
+ ; GCN-NEXT: v_exp_f32_e32 v74, v18
+ ; GCN-NEXT: v_exp_f32_e32 v75, v19
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+ ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+ ; GCN-NEXT: v_exp_f32_e32 v80, v20
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73
+ ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v81, v21
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74
+ ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v82, v22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+ ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72
+ ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+ ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72
+ ; GCN-NEXT: v_exp_f32_e32 v83, v23
+ ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v85, v22
+ ; GCN-NEXT: v_exp_f32_e32 v17, v17
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17
+ ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v88, v23
+ ; GCN-NEXT: v_fma_f32 v30, s4, v30, -v72
+ ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19
+ ; GCN-NEXT: v_exp_f32_e32 v16, v24
+ ; GCN-NEXT: ds_read_b128 v[18:21], v84
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v30, 0x3fb8aa3b, v30
+ ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83
+ ; GCN-NEXT: v_fma_f32 v19, s4, v28, -v72
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v80
+ ; GCN-NEXT: v_fma_f32 v31, s4, v31, -v72
+ ; GCN-NEXT: v_perm_b32 v90, v68, v64, s2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v26
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81
+ ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v22, v22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v82
+ ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3
+ ; GCN-NEXT: v_perm_b32 v91, v69, v65, s2
+ ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3
+ ; GCN-NEXT: ds_read_b128 v[26:29], v92
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[68:71], v92 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: ds_write_b32 v76, v90
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v77, v64
+ ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v67
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v78, v91
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b32 v79, v65
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v19
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v87
+ ; GCN-NEXT: v_pack_b32_f16 v19, v23, v89
+ ; GCN-NEXT: v_pack_b32_f16 v18, v18, v86
+ ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v64, v64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v31
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v85
+ ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v23, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v31, v88
+ ; GCN-NEXT: v_fma_f32 v3, s4, v3, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v65, v67
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22
+ ; GCN-NEXT: v_fma_f32 v4, s4, v4, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v19, v30
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v64
+ ; GCN-NEXT: v_mul_f32_e32 v25, 0x3fb8aa3b, v0
+ ; GCN-NEXT: v_mul_f32_e32 v30, 0x3fb8aa3b, v1
+ ; GCN-NEXT: v_pack_b32_f16 v0, v21, v31
+ ; GCN-NEXT: v_pack_b32_f16 v1, v18, v24
+ ; GCN-NEXT: v_fma_f32 v5, s4, v5, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v20, v20
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v23
+ ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v21, v25
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v65
+ ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v25, v30
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
+ ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v26, v2
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20
+ ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v4
+ ; GCN-NEXT: v_pack_b32_f16 v1, v0, v1
+ ; GCN-NEXT: v_pack_b32_f16 v0, v18, v24
+ ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v27, v3
+ ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[0:1], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v21
+ ; GCN-NEXT: v_fma_f32 v5, s4, v8, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v18, v2
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25
+ ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v24, v3
+ ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[0:1], v[32:47]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v84
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v26
+ ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v29, v6
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27
+ ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7
+ ; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5
+ ; GCN-NEXT: v_pack_b32_f16 v8, v4, v8
+ ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v30, v7
+ ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v9
+ ; GCN-NEXT: v_pack_b32_f16 v9, v28, v6
+ ; GCN-NEXT: ; implicit-def: $sgpr2
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18
+ ; GCN-NEXT: v_exp_f32_e32 v28, v5
+ ; GCN-NEXT: v_exp_f32_e32 v67, v7
+ ; GCN-NEXT: ds_read_b128 v[4:7], v84 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v10
+ ; GCN-NEXT: v_mul_f32_e32 v10, 0x3fb8aa3b, v11
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v9, s4, v15, -v72
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v31, v24
+ ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v29
+ ; GCN-NEXT: v_exp_f32_e32 v8, v1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v30
+ ; GCN-NEXT: v_pack_b32_f16 v0, v0, v31
+ ; GCN-NEXT: v_exp_f32_e32 v10, v10
+ ; GCN-NEXT: v_pack_b32_f16 v1, v4, v1
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v12
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v28
+ ; GCN-NEXT: v_exp_f32_e32 v11, v2
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v13
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v12, v67
+ ; GCN-NEXT: v_exp_f32_e32 v13, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+ ; GCN-NEXT: v_add_f32_e32 v0, 0, v73
+ ; GCN-NEXT: v_add_f32_e32 v0, v17, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v80, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v81, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v82, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v83, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v85, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v88, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v22, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v64, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v23, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v65, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v19, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v20, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v21, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v25, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v26, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v27, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v18, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v24, v0
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5
+ ; GCN-NEXT: v_add_f32_e32 v0, v29, v0
+ ; GCN-NEXT: v_exp_f32_e32 v6, v1
+ ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v9
+ ; GCN-NEXT: v_add_f32_e32 v0, v30, v0
+ ; GCN-NEXT: v_exp_f32_e32 v14, v1
+ ; GCN-NEXT: v_add_f32_e32 v9, v28, v0
+ ; GCN-NEXT: ds_read_b128 v[0:3], v92
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v10
+ ; GCN-NEXT: v_add_f32_e32 v9, v67, v9
+ ; GCN-NEXT: v_add_f32_e32 v8, v8, v9
+ ; GCN-NEXT: v_add_f32_e32 v8, v10, v8
+ ; GCN-NEXT: v_add_f32_e32 v10, v11, v8
+ ; GCN-NEXT: v_pack_b32_f16 v9, v5, v7
+ ; GCN-NEXT: v_pack_b32_f16 v8, v4, v12
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v15, v14
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v13
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11
+ ; GCN-NEXT: v_add_f32_e32 v1, v13, v10
+ ; GCN-NEXT: v_add_f32_e32 v10, v6, v1
+ ; GCN-NEXT: v_pack_b32_f16 v1, v0, v15
+ ; GCN-NEXT: v_pack_b32_f16 v0, v4, v17
+ ; GCN-NEXT: ds_read_b128 v[4:7], v92 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+ ; GCN-NEXT: v_mov_b32_e32 v4, 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+ ; GCN-NEXT: v_add_f32_e32 v2, v14, v10
+ ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
+ ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+ ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16
+ ; GCN-NEXT: s_endpgm
+ attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+
+ !0 = !{i64 2862105}
+
+...
+
+---
+name: smallInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = COPY %0:vgpr_32
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:vreg_64_align2 = IMPLICIT_DEF
+ %5:sgpr_128 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:sgpr_128 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:sgpr_512 = IMPLICIT_DEF
+ %11:sgpr_32 = IMPLICIT_DEF
+ %12:sreg_64_xexec = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:sreg_32 = IMPLICIT_DEF
+ %15:sreg_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:sreg_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ %24:vgpr_32 = IMPLICIT_DEF
+ %25:vgpr_32 = IMPLICIT_DEF
+ %26:sreg_32 = IMPLICIT_DEF
+ %42:vgpr_32 = IMPLICIT_DEF
+ %44:vreg_128_align2 = IMPLICIT_DEF
+ %48:vgpr_32 = IMPLICIT_DEF
+ %49:vreg_128_align2 = IMPLICIT_DEF
+ %52:vreg_128_align2 = IMPLICIT_DEF
+ %55:vreg_128_align2 = IMPLICIT_DEF
+ %106:vgpr_32 = IMPLICIT_DEF
+ %29:vgpr_32 = IMPLICIT_DEF
+ %37:vgpr_32 = IMPLICIT_DEF
+ %259:vreg_512_align2 = IMPLICIT_DEF
+ %260:vreg_512_align2 = IMPLICIT_DEF
+ IGLP_OPT 2
+ %27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec
+ %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec
+ %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, %4:vreg_64_align2, 0, implicit $exec
+ %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc
+ %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec
+ DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec
+ %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec
+ %36:vgpr_32 = V_ADD_U32_e32 %7:vgpr_32, %37:vgpr_32, implicit $exec
+ %38:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %36:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %39:vgpr_32 = V_ADD_U32_e32 %9:vgpr_32, %37:vgpr_32, implicit $exec
+ %40:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %39:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %41:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+ early-clobber %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %41.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %41.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %45:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+ early-clobber %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %45.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %45.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %47:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %50:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ DS_WRITE_B128_gfx9 %34:vgpr_32, %35:vreg_128_align2, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %51:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %53:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %54:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %56:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %57:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %58:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %59:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %60:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %61:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %62:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %63:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %64:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %65:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %66:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %67:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %68:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %69:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %70:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %71:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %72:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %73:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub0:vreg_512_align2, implicit $mode, implicit $exec
+ %74:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub1:vreg_512_align2, implicit $mode, implicit $exec
+ %75:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub2:vreg_512_align2, implicit $mode, implicit $exec
+ %76:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub3:vreg_512_align2, implicit $mode, implicit $exec
+ %77:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub4:vreg_512_align2, implicit $mode, implicit $exec
+ %78:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub5:vreg_512_align2, implicit $mode, implicit $exec
+ %79:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub6:vreg_512_align2, implicit $mode, implicit $exec
+ %80:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub7:vreg_512_align2, implicit $mode, implicit $exec
+ %81:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub8:vreg_512_align2, implicit $mode, implicit $exec
+ %82:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub9:vreg_512_align2, implicit $mode, implicit $exec
+ %83:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub10:vreg_512_align2, implicit $mode, implicit $exec
+ %84:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub11:vreg_512_align2, implicit $mode, implicit $exec
+ %85:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub12:vreg_512_align2, implicit $mode, implicit $exec
+ %86:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub13:vreg_512_align2, implicit $mode, implicit $exec
+ %87:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub14:vreg_512_align2, implicit $mode, implicit $exec
+ %88:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub15:vreg_512_align2, implicit $mode, implicit $exec
+ %89:vgpr_32 = V_MAX3_F32_e64 0, %57:vgpr_32, 0, %11:sgpr_32, 0, %58:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %90:vgpr_32 = V_MAX3_F32_e64 0, %89:vgpr_32, 0, %59:vgpr_32, 0, %60:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %91:vgpr_32 = V_MAX3_F32_e64 0, %90:vgpr_32, 0, %61:vgpr_32, 0, %62:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %92:vgpr_32 = V_MAX3_F32_e64 0, %91:vgpr_32, 0, %63:vgpr_32, 0, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %93:vgpr_32 = V_MAX3_F32_e64 0, %92:vgpr_32, 0, %65:vgpr_32, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %94:vgpr_32 = V_MAX3_F32_e64 0, %93:vgpr_32, 0, %67:vgpr_32, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %95:vgpr_32 = V_MAX3_F32_e64 0, %94:vgpr_32, 0, %69:vgpr_32, 0, %70:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %96:vgpr_32 = V_MAX3_F32_e64 0, %95:vgpr_32, 0, %71:vgpr_32, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %97:vgpr_32 = V_MAX3_F32_e64 0, %96:vgpr_32, 0, %73:vgpr_32, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %98:vgpr_32 = V_MAX3_F32_e64 0, %97:vgpr_32, 0, %75:vgpr_32, 0, %76:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %99:vgpr_32 = V_MAX3_F32_e64 0, %98:vgpr_32, 0, %77:vgpr_32, 0, %78:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %100:vgpr_32 = V_MAX3_F32_e64 0, %99:vgpr_32, 0, %79:vgpr_32, 0, %80:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %101:vgpr_32 = V_MAX3_F32_e64 0, %100:vgpr_32, 0, %81:vgpr_32, 0, %82:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %102:vgpr_32 = V_MAX3_F32_e64 0, %101:vgpr_32, 0, %83:vgpr_32, 0, %84:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %103:vgpr_32 = V_MAX3_F32_e64 0, %102:vgpr_32, 0, %85:vgpr_32, 0, %86:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %104:vgpr_32 = V_MAX3_F32_e64 0, %103:vgpr_32, 0, %87:vgpr_32, 0, %88:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %105:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %104:vgpr_32, 0, implicit $exec
+ %107:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %105:vgpr_32, %105:vgpr_32, implicit $mode, implicit $exec
+ %108:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %104:vgpr_32, %107:vgpr_32, implicit $mode, implicit $exec
+ %109:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %108:vgpr_32, 0, implicit $exec
+ %110:vgpr_32 = V_CNDMASK_B32_e64 0, %109:vgpr_32, 0, %108:vgpr_32, %12:sreg_64_xexec, implicit $exec
+ %111:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %110:vgpr_32, %110:vgpr_32, implicit $mode, implicit $exec
+ %112:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %13:vgpr_32, %13:vgpr_32, implicit $mode, implicit $exec
+ %113:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %112:vgpr_32, %111:vgpr_32, implicit $mode, implicit $exec
+ %114:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %115:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %114:vgpr_32, implicit $mode, implicit $exec
+ %116:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %115:vgpr_32, implicit $mode, implicit $exec
+ %117:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %118:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %117:vgpr_32, implicit $mode, implicit $exec
+ %119:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %118:vgpr_32, implicit $mode, implicit $exec
+ %120:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %121:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %120:vgpr_32, implicit $mode, implicit $exec
+ %122:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %121:vgpr_32, implicit $mode, implicit $exec
+ %123:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %124:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %123:vgpr_32, implicit $mode, implicit $exec
+ %125:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %124:vgpr_32, implicit $mode, implicit $exec
+ %126:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %127:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %126:vgpr_32, implicit $mode, implicit $exec
+ %128:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %127:vgpr_32, implicit $mode, implicit $exec
+ %129:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %130:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %129:vgpr_32, implicit $mode, implicit $exec
+ %131:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %130:vgpr_32, implicit $mode, implicit $exec
+ %132:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %133:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %132:vgpr_32, implicit $mode, implicit $exec
+ %134:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %133:vgpr_32, implicit $mode, implicit $exec
+ %135:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %136:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %135:vgpr_32, implicit $mode, implicit $exec
+ %137:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %136:vgpr_32, implicit $mode, implicit $exec
+ %138:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %139:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %138:vgpr_32, implicit $mode, implicit $exec
+ %140:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %139:vgpr_32, implicit $mode, implicit $exec
+ %141:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %142:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %141:vgpr_32, implicit $mode, implicit $exec
+ %143:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %142:vgpr_32, implicit $mode, implicit $exec
+ %144:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %145:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %144:vgpr_32, implicit $mode, implicit $exec
+ %146:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %145:vgpr_32, implicit $mode, implicit $exec
+ %147:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %148:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %147:vgpr_32, implicit $mode, implicit $exec
+ %149:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %148:vgpr_32, implicit $mode, implicit $exec
+ %150:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %151:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %150:vgpr_32, implicit $mode, implicit $exec
+ %152:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %151:vgpr_32, implicit $mode, implicit $exec
+ %153:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %154:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %153:vgpr_32, implicit $mode, implicit $exec
+ %155:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %154:vgpr_32, implicit $mode, implicit $exec
+ %156:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %156:vgpr_32, implicit $mode, implicit $exec
+ %158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %157:vgpr_32, implicit $mode, implicit $exec
+ %159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %159:vgpr_32, implicit $mode, implicit $exec
+ %161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %160:vgpr_32, implicit $mode, implicit $exec
+ %162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %162:vgpr_32, implicit $mode, implicit $exec
+ %164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %163:vgpr_32, implicit $mode, implicit $exec
+ %165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %165:vgpr_32, implicit $mode, implicit $exec
+ %167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %166:vgpr_32, implicit $mode, implicit $exec
+ %168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %168:vgpr_32, implicit $mode, implicit $exec
+ %170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %169:vgpr_32, implicit $mode, implicit $exec
+ %171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %171:vgpr_32, implicit $mode, implicit $exec
+ %173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %172:vgpr_32, implicit $mode, implicit $exec
+ %174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %174:vgpr_32, implicit $mode, implicit $exec
+ %176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %175:vgpr_32, implicit $mode, implicit $exec
+ %177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %177:vgpr_32, implicit $mode, implicit $exec
+ %179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %178:vgpr_32, implicit $mode, implicit $exec
+ %180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %180:vgpr_32, implicit $mode, implicit $exec
+ %182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %181:vgpr_32, implicit $mode, implicit $exec
+ %183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %183:vgpr_32, implicit $mode, implicit $exec
+ %185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %184:vgpr_32, implicit $mode, implicit $exec
+ %186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %186:vgpr_32, implicit $mode, implicit $exec
+ %188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %187:vgpr_32, implicit $mode, implicit $exec
+ %189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %189:vgpr_32, implicit $mode, implicit $exec
+ %191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %190:vgpr_32, implicit $mode, implicit $exec
+ %192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %192:vgpr_32, implicit $mode, implicit $exec
+ %194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %193:vgpr_32, implicit $mode, implicit $exec
+ %195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %195:vgpr_32, implicit $mode, implicit $exec
+ %197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %196:vgpr_32, implicit $mode, implicit $exec
+ %198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %198:vgpr_32, implicit $mode, implicit $exec
+ %200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %199:vgpr_32, implicit $mode, implicit $exec
+ %201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %201:vgpr_32, implicit $mode, implicit $exec
+ %203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %202:vgpr_32, implicit $mode, implicit $exec
+ %204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %204:vgpr_32, implicit $mode, implicit $exec
+ %206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %205:vgpr_32, implicit $mode, implicit $exec
+ %207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %207:vgpr_32, implicit $mode, implicit $exec
+ %209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %208:vgpr_32, implicit $mode, implicit $exec
+ %210:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %116:vgpr_32, implicit $mode, implicit $exec
+ %211:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %119:vgpr_32, %210:vgpr_32, implicit $mode, implicit $exec
+ %212:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %122:vgpr_32, %211:vgpr_32, implicit $mode, implicit $exec
+ %213:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %125:vgpr_32, %212:vgpr_32, implicit $mode, implicit $exec
+ %214:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %128:vgpr_32, %213:vgpr_32, implicit $mode, implicit $exec
+ %215:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %131:vgpr_32, %214:vgpr_32, implicit $mode, implicit $exec
+ %216:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %134:vgpr_32, %215:vgpr_32, implicit $mode, implicit $exec
+ %217:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %137:vgpr_32, %216:vgpr_32, implicit $mode, implicit $exec
+ %218:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %140:vgpr_32, %217:vgpr_32, implicit $mode, implicit $exec
+ %219:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %143:vgpr_32, %218:vgpr_32, implicit $mode, implicit $exec
+ %220:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %146:vgpr_32, %219:vgpr_32, implicit $mode, implicit $exec
+ %221:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %149:vgpr_32, %220:vgpr_32, implicit $mode, implicit $exec
+ %222:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %152:vgpr_32, %221:vgpr_32, implicit $mode, implicit $exec
+ %223:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %155:vgpr_32, %222:vgpr_32, implicit $mode, implicit $exec
+ %224:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %158:vgpr_32, %223:vgpr_32, implicit $mode, implicit $exec
+ %225:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %161:vgpr_32, %224:vgpr_32, implicit $mode, implicit $exec
+ %226:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %164:vgpr_32, %225:vgpr_32, implicit $mode, implicit $exec
+ %227:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %167:vgpr_32, %226:vgpr_32, implicit $mode, implicit $exec
+ %228:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %170:vgpr_32, %227:vgpr_32, implicit $mode, implicit $exec
+ %229:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %173:vgpr_32, %228:vgpr_32, implicit $mode, implicit $exec
+ %230:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %176:vgpr_32, %229:vgpr_32, implicit $mode, implicit $exec
+ %231:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %179:vgpr_32, %230:vgpr_32, implicit $mode, implicit $exec
+ %232:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %182:vgpr_32, %231:vgpr_32, implicit $mode, implicit $exec
+ %233:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %185:vgpr_32, %232:vgpr_32, implicit $mode, implicit $exec
+ %234:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %188:vgpr_32, %233:vgpr_32, implicit $mode, implicit $exec
+ %235:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %191:vgpr_32, %234:vgpr_32, implicit $mode, implicit $exec
+ %236:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %194:vgpr_32, %235:vgpr_32, implicit $mode, implicit $exec
+ %237:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %197:vgpr_32, %236:vgpr_32, implicit $mode, implicit $exec
+ %238:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %200:vgpr_32, %237:vgpr_32, implicit $mode, implicit $exec
+ %239:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %203:vgpr_32, %238:vgpr_32, implicit $mode, implicit $exec
+ %240:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %206:vgpr_32, %239:vgpr_32, implicit $mode, implicit $exec
+ %241:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %209:vgpr_32, %240:vgpr_32, implicit $mode, implicit $exec
+ %242:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %241:vgpr_32, 0, implicit $exec
+ %243:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %241:vgpr_32, %242:vgpr_32, implicit $mode, implicit $exec
+ %244:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %243:vgpr_32, 0, implicit $exec
+ %0:vgpr_32 = V_CNDMASK_B32_e64 0, %244:vgpr_32, 0, %243:vgpr_32, %12:sreg_64_xexec, implicit $exec
+ %245:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %13:vgpr_32, %113:vgpr_32, implicit $mode, implicit $exec
+ %246:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %245:vgpr_32, implicit $mode, implicit $exec
+ undef %247.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %246:vgpr_32, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %248:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+ %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+ %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+ %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+ %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, %16:vgpr_32, implicit $exec
+ %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec
+ %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec
+ %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec
+ DS_WRITE_B32_gfx9 %255:vgpr_32, %248:vgpr_32, 0, 0, implicit $exec
+ %256:vgpr_32 = V_LSHL_ADD_U32_e64 %19:vgpr_32, 1, %255:vgpr_32, implicit $exec
+ DS_WRITE_B32_gfx9 %256:vgpr_32, %249:vgpr_32, 0, 0, implicit $exec
+ %257:vgpr_32 = V_LSHL_ADD_U32_e64 %20:vgpr_32, 1, %256:vgpr_32, implicit $exec
+ DS_WRITE_B32_gfx9 %257:vgpr_32, %250:vgpr_32, 0, 0, implicit $exec
+ %258:vgpr_32 = V_LSHL_ADD_U32_e64 %21:vgpr_32, 1, %257:vgpr_32, implicit $exec
+ DS_WRITE_B32_gfx9 %258:vgpr_32, %251:vgpr_32, 0, 0, implicit $exec
+ %0:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %1:vgpr_32, %247.sub0:vreg_64_align2, %0:vgpr_32, implicit $mode, implicit $exec
+ %259.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %259.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %260.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %261:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %116:vgpr_32, implicit $mode, implicit $exec
+ %262:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %119:vgpr_32, implicit $mode, implicit $exec
+ %263:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %122:vgpr_32, implicit $mode, implicit $exec
+ %264:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %128:vgpr_32, implicit $mode, implicit $exec
+ %265:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %131:vgpr_32, implicit $mode, implicit $exec
+ %266:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %134:vgpr_32, implicit $mode, implicit $exec
+ %267:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %140:vgpr_32, implicit $mode, implicit $exec
+ %268:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %143:vgpr_32, implicit $mode, implicit $exec
+ %269:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %146:vgpr_32, implicit $mode, implicit $exec
+ %270:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %152:vgpr_32, implicit $mode, implicit $exec
+ %271:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %155:vgpr_32, implicit $mode, implicit $exec
+ %272:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %158:vgpr_32, implicit $mode, implicit $exec
+ %273:vgpr_32 = V_ADD_U32_e32 %22:vgpr_32, %37:vgpr_32, implicit $exec
+ %274:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %273:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+ %275:vgpr_32 = V_ADD_U32_e32 %23:vgpr_32, %37:vgpr_32, implicit $exec
+ %276:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %275:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %277:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %278:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+ %279:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+ %280:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %281:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %14:sreg_32, implicit $exec
+ %282:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %15:sreg_32, implicit $exec
+ %283:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %14:sreg_32, implicit $exec
+ %284:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %15:sreg_32, implicit $exec
+ DS_WRITE_B32_gfx9 %255:vgpr_32, %281:vgpr_32, 0, 0, implicit $exec
+ DS_WRITE_B32_gfx9 %256:vgpr_32, %282:vgpr_32, 0, 0, implicit $exec
+ DS_WRITE_B32_gfx9 %257:vgpr_32, %283:vgpr_32, 0, 0, implicit $exec
+ DS_WRITE_B32_gfx9 %258:vgpr_32, %284:vgpr_32, 0, 0, implicit $exec
+ %285:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %125:vgpr_32, implicit $mode, implicit $exec
+ %286:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %137:vgpr_32, implicit $mode, implicit $exec
+ %287:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %149:vgpr_32, implicit $mode, implicit $exec
+ %288:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %161:vgpr_32, implicit $mode, implicit $exec
+ undef %289.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %263:vgpr_32, 0, %285:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %289.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %261:vgpr_32, 0, %262:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %290.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %266:vgpr_32, 0, %286:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %290.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %264:vgpr_32, 0, %265:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %291.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %269:vgpr_32, 0, %287:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %291.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %267:vgpr_32, 0, %268:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %292.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %272:vgpr_32, 0, %288:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %292.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %270:vgpr_32, 0, %271:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %293:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %209:vgpr_32, implicit $mode, implicit $exec
+ %294:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %203:vgpr_32, implicit $mode, implicit $exec
+ %295:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %197:vgpr_32, implicit $mode, implicit $exec
+ %296:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %191:vgpr_32, implicit $mode, implicit $exec
+ %297:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %185:vgpr_32, implicit $mode, implicit $exec
+ %298:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %179:vgpr_32, implicit $mode, implicit $exec
+ %299:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %173:vgpr_32, implicit $mode, implicit $exec
+ %300:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %167:vgpr_32, implicit $mode, implicit $exec
+ %301:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %206:vgpr_32, implicit $mode, implicit $exec
+ %302:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %200:vgpr_32, implicit $mode, implicit $exec
+ %303:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %194:vgpr_32, implicit $mode, implicit $exec
+ %304:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %188:vgpr_32, implicit $mode, implicit $exec
+ %305:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %182:vgpr_32, implicit $mode, implicit $exec
+ %306:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %176:vgpr_32, implicit $mode, implicit $exec
+ %307:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %170:vgpr_32, implicit $mode, implicit $exec
+ %308:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %164:vgpr_32, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ undef %309.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %307:vgpr_32, 0, %299:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %309.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %308:vgpr_32, 0, %300:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %310.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %305:vgpr_32, 0, %297:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %310.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %306:vgpr_32, 0, %298:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %311.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %303:vgpr_32, 0, %295:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %311.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %304:vgpr_32, 0, %296:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ undef %312.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %301:vgpr_32, 0, %293:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %312.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %302:vgpr_32, 0, %294:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %313:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %314:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %315:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %316:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
+ %37:vgpr_32 = V_ADD_U32_e32 %26:sreg_32, %37:vgpr_32, implicit $exec
+ %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec
+ S_ENDPGM 0
+...
+
+
More information about the llvm-commits
mailing list