[llvm] [AMDGPU] Add scheduling strategy for critical resource in remainder (PR #184657)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 11:06:38 PST 2026
https://github.com/lijinpei-amd updated https://github.com/llvm/llvm-project/pull/184657
>From 36e9a4031e7dbfef548fcbcddb87c992bfcf6514 Mon Sep 17 00:00:00 2001
From: Li Jinpei <jinpli at amd.com>
Date: Wed, 4 Mar 2026 15:07:39 +0800
Subject: [PATCH] [NFC][MISched][AMDGPU] Expose tracePick to be used in AMDGPU
backend
- Fix wrong IsTopNode for `PostGenericScheduler::pickNode`.
- Add tracePick to pre-ra pickOnlyChoice.
- Apply eaff28c9 to GCNSchedStrategy.
- Fix inaccurate GCNSchedStrategy::tryCandidate reason.
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 6 +
llvm/lib/CodeGen/MachineScheduler.cpp | 19 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 339 ++++++-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 53 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 792 +++++++--------
.../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 280 +++---
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 932 +++++++++---------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 358 ++++---
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 38 +-
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 493 ++++-----
.../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll | 90 +-
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 464 +++++----
.../AMDGPU/mfma-no-register-aliasing.ll | 12 +-
.../CodeGen/AMDGPU/misched-remat-revert.ll | 190 ++--
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 14 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 60 +-
17 files changed, 2232 insertions(+), 1919 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 33036030679e5..65d20f5192765 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1204,6 +1204,12 @@ class GenericSchedulerBase : public MachineSchedStrategy {
const TargetSchedModel *SchedModel);
};
+ static void tracePick(CandReason Reason, bool IsTop, bool IsPostRA = false);
+
+ static void tracePick(const SchedCandidate &Cand, bool IsPostRA = false) {
+ tracePick(Cand.Reason, Cand.AtTop, IsPostRA);
+ }
+
protected:
const MachineSchedContext *Context;
const TargetSchedModel *SchedModel = nullptr;
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 6697c0a110dc3..4fff90a81fc38 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3505,8 +3505,8 @@ bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
return false;
}
-static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
- bool IsPostRA = false) {
+void GenericSchedulerBase::tracePick(CandReason Reason, bool IsTop,
+ bool IsPostRA) {
LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
<< GenericSchedulerBase::getReasonStr(Reason) << " ["
<< (IsPostRA ? "post-RA" : "pre-RA") << "]\n");
@@ -3633,11 +3633,6 @@ static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
llvm_unreachable("Unknown reason!");
}
-static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
- bool IsPostRA = false) {
- tracePick(Cand.Reason, Cand.AtTop, IsPostRA);
-}
-
void GenericScheduler::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() &&
"(PreRA)GenericScheduler needs vreg liveness");
@@ -4146,7 +4141,9 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
SUnit *SU;
if (RegionPolicy.OnlyTopDown) {
SU = Top.pickOnlyChoice();
- if (!SU) {
+ if (SU) {
+ tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/false);
+ } else {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
@@ -4157,7 +4154,9 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
- if (!SU) {
+ if (SU) {
+ tracePick(Only1, /*IsTopNode=*/false, /*IsPostRA=*/false);
+ } else {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
@@ -4513,7 +4512,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
if (SU) {
- tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/true);
+ tracePick(Only1, /*IsTopNode=*/false, /*IsPostRA=*/true);
} else {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5b3effbcc7179..92e6ee262f87a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1267,14 +1267,19 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
+ auto PostSchedulerPtr = std::make_unique<GCNPostSchedStrategy>(C);
+ auto *PostScheduler = PostSchedulerPtr.get();
ScheduleDAGMI *DAG =
- new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
- /*RemoveKillFlags=*/true);
+ new GCNPostScheduleDAGMI(C, std::move(PostSchedulerPtr),
+ /*RemoveKillFlags=*/true);
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+ auto IGLPMutation =
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA);
+ PostScheduler->setIGLPDAGMutation(IGLPMutation.get());
+ DAG->addMutation(std::move(IGLPMutation));
if ((EnableVOPD.getNumOccurrences() ||
getOptLevel() >= CodeGenOptLevel::Less) &&
EnableVOPD)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 77c322eb3178e..76b1ccf44fc38 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -466,10 +466,12 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
+ tracePick(Only1, /*IsTopNode=*/false);
return SU;
}
if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
+ tracePick(Only1, /*IsTopNode=*/true);
return SU;
}
// Set the bottom-up policy based on the state of the current bottom zone
@@ -552,6 +554,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
+ tracePick(Cand);
return Cand.SU;
}
@@ -565,36 +568,41 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
}
bool PickedPending;
SUnit *SU;
- do {
- PickedPending = false;
- if (RegionPolicy.OnlyTopDown) {
- SU = pickOnlyChoice(Top, SchedModel);
- if (!SU) {
- CandPolicy NoPolicy;
- TopCand.reset(NoPolicy);
- pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
- PickedPending,
- /*IsBottomUp=*/false);
- assert(TopCand.Reason != NoCand && "failed to find a candidate");
- SU = TopCand.SU;
- }
- IsTopNode = true;
- } else if (RegionPolicy.OnlyBottomUp) {
- SU = pickOnlyChoice(Bot, SchedModel);
- if (!SU) {
- CandPolicy NoPolicy;
- BotCand.reset(NoPolicy);
- pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
- PickedPending,
- /*IsBottomUp=*/true);
- assert(BotCand.Reason != NoCand && "failed to find a candidate");
- SU = BotCand.SU;
- }
- IsTopNode = false;
+ PickedPending = false;
+ if (RegionPolicy.OnlyTopDown) {
+ SU = pickOnlyChoice(Top, SchedModel);
+ if (SU) {
+ tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/false);
+ } else {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending,
+ /*IsBottomUp=*/false);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (RegionPolicy.OnlyBottomUp) {
+ SU = pickOnlyChoice(Bot, SchedModel);
+ if (SU) {
+ tracePick(Only1, /*IsTopNode=*/false, /*IsPostRA=*/false);
} else {
- SU = pickNodeBidirectional(IsTopNode, PickedPending);
+ CandPolicy NoPolicy;
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ PickedPending,
+ /*IsBottomUp=*/true);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand);
+ SU = BotCand.SU;
}
- } while (SU->isScheduled);
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidirectional(IsTopNode, PickedPending);
+ }
+ assert(!SU->isScheduled && "SUnit scheduled twice.");
if (PickedPending) {
unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
@@ -622,6 +630,40 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
return SU;
}
+static unsigned
+countCriticalResourceInRemainder(const SchedRemainder &Rem,
+ const TargetSchedModel *SchedModel) {
+ if (!SchedModel || !SchedModel->hasInstrSchedModel()) {
+ return 0;
+ }
+ unsigned CriticalResIdx = 0;
+ unsigned CriticalResCount = 0;
+ for (unsigned I = 1, E = SchedModel->getNumProcResourceKinds(); I < E; ++I) {
+ if (SchedModel->getResourceBufferSize(I) != 0) {
+ continue;
+ }
+ unsigned ResCount =
+ Rem.RemainingCounts[I] * SchedModel->getResourceFactor(I);
+ if (ResCount > CriticalResCount) {
+ CriticalResIdx = I;
+ CriticalResCount = ResCount;
+ }
+ }
+ if (CriticalResIdx) {
+ unsigned LatencyPath =
+ Rem.IsAcyclicLatencyLimited ? Rem.CriticalPath : Rem.CyclicCritPath;
+ if (LatencyPath * SchedModel->getLatencyFactor() < CriticalResCount)
+ return CriticalResIdx;
+ }
+ return 0;
+}
+
+void GCNSchedStrategy::updateRemainderCriticalRes() {
+ RemCriticalRes = TrackRemCriticalRes
+ ? countCriticalResourceInRemainder(Rem, SchedModel)
+ : 0;
+}
+
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
@@ -629,7 +671,8 @@ void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
: UpwardTracker.recede(*MI);
}
- return GenericScheduler::schedNode(SU, IsTopNode);
+ GenericScheduler::schedNode(SU, IsTopNode);
+ updateRemainderCriticalRes();
}
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
@@ -710,6 +753,139 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
GCNTrackers = GCNTrackers & !IsLegacyScheduler;
}
+static unsigned getResourceUseCount(unsigned ResId, const MCSchedClassDesc *SC,
+ const TargetSchedModel *SchedModel) {
+ if (!SC) {
+ return 0;
+ }
+ unsigned Count = 0;
+ for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC);
+ PI != PE; ++PI) {
+ if (PI->ProcResourceIdx != ResId) {
+ continue;
+ }
+ Count += PI->ReleaseAtCycle - PI->AcquireAtCycle;
+ }
+ return Count;
+}
+
+bool GCNMaxOccupancySchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = FirstValid;
+ return true;
+ }
+
+ // Bias PhysReg defs and copies toward their uses and definitions, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable, in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+
+ if (RemCriticalRes) {
+
+ // Prioritize instructions that use critical resource
+ if (tryGreater(getResourceUseCount(RemCriticalRes,
+ DAG->getSchedClass(TryCand.SU),
+ SchedModel),
+ getResourceUseCount(RemCriticalRes,
+ DAG->getSchedClass(Cand.SU), SchedModel),
+ TryCand, Cand, ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
+ Cluster))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+
+ return false;
+}
+
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
@@ -720,7 +896,7 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
- TryCand.Reason = NodeOrder;
+ TryCand.Reason = FirstValid;
return true;
}
@@ -796,8 +972,12 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
return true;
+ } else {
+ Cand.Reason = NodeOrder;
+ return false;
}
}
+ Cand.Reason = FirstValid;
return false;
}
@@ -822,7 +1002,7 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
- TryCand.Reason = NodeOrder;
+ TryCand.Reason = FirstValid;
return true;
}
@@ -929,12 +1109,98 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
+ } else {
+ Cand.Reason = NodeOrder;
+ return false;
}
}
+ Cand.Reason = FirstValid;
return false;
}
+void GCNPostSchedStrategy::updateRemainderCriticalRes() {
+ RemCriticalRes = TrackRemCriticalRes
+ ? countCriticalResourceInRemainder(Rem, SchedModel)
+ : 0;
+}
+
+bool GCNPostSchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = FirstValid;
+ return true;
+ }
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+ Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+
+ if (RemCriticalRes) {
+
+ // Prioritize instructions that use critical resource
+ if (tryGreater(getResourceUseCount(RemCriticalRes,
+ DAG->getSchedClass(TryCand.SU),
+ SchedModel),
+ getResourceUseCount(RemCriticalRes,
+ DAG->getSchedClass(Cand.SU), SchedModel),
+ TryCand, Cand, ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Keep clustered nodes together.
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
+ Cluster))
+ return TryCand.Reason != NoCand;
+ // Avoid critical resource consumption and balance the schedule.
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary.
+ if (Cand.AtTop == TryCand.AtTop) {
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency &&
+ tryLatency(TryCand, Cand, Cand.AtTop ? Top : Bot))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ return false;
+}
+
+void GCNPostSchedStrategy::initialize(ScheduleDAGMI *Dag) {
+ PostGenericScheduler::initialize(Dag);
+ setTrackRemainderCriticalRes(
+ Context->MF->getSubtarget<GCNSubtarget>(),
+ !static_cast<GCNPostScheduleDAGMI *>(Dag)->hasIGLPInstrs());
+ updateRemainderCriticalRes();
+}
+
+void GCNPostSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+ PostGenericScheduler::schedNode(SU, IsTopNode);
+ updateRemainderCriticalRes();
+}
+
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -1733,6 +1999,9 @@ bool GCNSchedStage::initGCNRegion() {
: AMDGPU::SchedulingPhase::PreRAReentry));
}
+ S.setTrackRemainderCriticalRes(ST, !DAG.RegionsWithIGLPInstrs[RegionIdx]);
+ S.updateRemainderCriticalRes();
+
return true;
}
@@ -3203,13 +3472,13 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
});
}
-GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
+GCNPostScheduleDAGMI::GCNPostScheduleDAGMI(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags)
: ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
-void GCNPostScheduleDAGMILive::schedule() {
- HasIGLPInstrs = hasIGLPInstrs(this);
+void GCNPostScheduleDAGMI::schedule() {
+ HasIGLPInstrs = ::hasIGLPInstrs(this);
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
@@ -3219,7 +3488,7 @@ void GCNPostScheduleDAGMILive::schedule() {
ScheduleDAGMI::schedule();
}
-void GCNPostScheduleDAGMILive::finalizeSchedule() {
+void GCNPostScheduleDAGMI::finalizeSchedule() {
if (HasIGLPInstrs)
SavedMutations.swap(Mutations);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 99fd55db33285..4ae2dd6e8c886 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -126,6 +126,10 @@ class GCNSchedStrategy : public GenericScheduler {
unsigned VGPRLimitBias = 0;
+ bool TrackRemCriticalRes = false; // Off until setTrackRemainderCriticalRes.
+
+ unsigned RemCriticalRes = 0; // 0 means no critical resource is tracked.
+
GCNSchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
@@ -150,11 +154,21 @@ class GCNSchedStrategy : public GenericScheduler {
GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
+
+ void setTrackRemainderCriticalRes(const GCNSubtarget &ST, bool B) {
+ TrackRemCriticalRes = B && ST.hasGFX940Insts();
+ }
+
+ void updateRemainderCriticalRes();
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
/// maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
bool IsLegacyScheduler = false);
@@ -182,6 +196,35 @@ class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
};
+class GCNPostSchedStrategy : public PostGenericScheduler {
+ ScheduleDAGMutation *IGLPMutation = nullptr;
+
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+
+public:
+ GCNPostSchedStrategy(const MachineSchedContext *C)
+ : PostGenericScheduler(C) {}
+
+ void initialize(ScheduleDAGMI *Dag) override;
+
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
+ bool TrackRemCriticalRes = false; // Off until setTrackRemainderCriticalRes.
+
+ unsigned RemCriticalRes = 0; // 0 means no critical resource is tracked.
+
+ void setIGLPDAGMutation(ScheduleDAGMutation *Mutation) {
+ IGLPMutation = Mutation;
+ }
+
+ void setTrackRemainderCriticalRes(const GCNSubtarget &ST, bool B) {
+ TrackRemCriticalRes = B && ST.hasGFX940Insts();
+ }
+
+ void updateRemainderCriticalRes();
+};
+
class ScheduleMetrics {
unsigned ScheduleLength;
unsigned BubbleCycles;
@@ -783,7 +826,7 @@ class MemoryClauseInitialScheduleStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};
-class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
+class GCNPostScheduleDAGMI final : public ScheduleDAGMI {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
@@ -794,9 +837,11 @@ class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
void finalizeSchedule() override;
- GCNPostScheduleDAGMILive(MachineSchedContext *C,
- std::unique_ptr<MachineSchedStrategy> S,
- bool RemoveKillFlags);
+ GCNPostScheduleDAGMI(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S,
+ bool RemoveKillFlags);
+
+ bool hasIGLPInstrs() const { return HasIGLPInstrs; }
};
} // End namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 700fc6db8549f..f508820f251f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -59,14 +59,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
@@ -97,14 +97,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -157,12 +157,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_i32_32x32x16i8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -173,9 +174,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -191,6 +191,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -202,23 +203,23 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_i32_32x32x16i8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -229,9 +230,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -247,6 +247,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -258,12 +259,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
@@ -301,14 +301,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
@@ -339,14 +339,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -424,14 +424,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
@@ -462,14 +462,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -547,14 +547,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
@@ -585,14 +585,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -670,14 +670,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
@@ -708,14 +708,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -768,12 +768,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -784,9 +785,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -802,6 +802,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -813,23 +814,23 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -840,9 +841,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -858,6 +858,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -869,12 +870,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -887,12 +887,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -903,9 +904,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -921,6 +921,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -932,23 +933,23 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -959,9 +960,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -977,6 +977,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -988,12 +989,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1006,12 +1006,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1022,9 +1023,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1040,6 +1040,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1051,23 +1052,23 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1078,9 +1079,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1096,6 +1096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1107,12 +1108,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1125,12 +1125,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1141,9 +1142,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1159,6 +1159,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1170,23 +1171,23 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1197,9 +1198,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1215,6 +1215,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1226,12 +1227,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1264,21 +1264,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1304,21 +1304,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1376,12 +1376,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1392,9 +1393,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1405,12 +1405,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1421,25 +1422,25 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v23 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1450,9 +1451,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1463,12 +1463,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1479,13 +1480,12 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v23 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -1666,21 +1666,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1706,21 +1706,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1778,12 +1778,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1794,9 +1795,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1807,12 +1807,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1823,25 +1824,25 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v23 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1852,9 +1853,8 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1865,12 +1865,13 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1881,13 +1882,12 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v23 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2070,26 +2070,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
;
; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
; GFX942-GISEL: ; %bb.0: ; %bb
-; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2117,26 +2117,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
;
; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
; GFX950-GISEL: ; %bb.0: ; %bb
-; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2260,6 +2260,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2270,7 +2271,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2279,9 +2280,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -2294,6 +2294,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2313,18 +2314,18 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2335,7 +2336,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2344,9 +2345,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -2359,6 +2359,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2378,12 +2379,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2584,26 +2584,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
;
; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
-; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2631,26 +2631,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
;
; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
-; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2795,26 +2795,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
;
; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
-; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2842,26 +2842,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
;
; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
-; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3006,26 +3006,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
;
; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
-; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3053,26 +3053,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
;
; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
-; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3217,26 +3217,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
;
; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
-; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3264,26 +3264,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
;
; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
-; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: global_store_dwordx4 v7, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3407,6 +3407,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3417,7 +3418,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3426,9 +3427,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3441,6 +3441,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3460,18 +3461,18 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3482,7 +3483,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3491,9 +3492,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3506,6 +3506,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3525,12 +3526,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3710,6 +3710,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3720,7 +3721,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3729,9 +3730,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3744,6 +3744,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3763,18 +3764,18 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3785,7 +3786,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3794,9 +3795,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -3809,6 +3809,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3828,12 +3829,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -4013,6 +4013,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4023,7 +4024,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4032,9 +4033,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4047,6 +4047,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4066,18 +4067,18 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4088,7 +4089,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4097,9 +4098,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4112,6 +4112,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4131,12 +4132,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -4316,6 +4316,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4326,7 +4327,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4335,9 +4336,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: s_nop 10
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4350,6 +4350,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4369,18 +4370,18 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4391,7 +4392,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s22
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4400,9 +4401,8 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT: s_nop 10
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v17 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: s_nop 11
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
@@ -4415,6 +4415,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v23, 0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4434,12 +4435,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT: s_nop 11
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v23, v[12:15], s[24:25] offset:48
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 55c076375fac4..640112ce63573 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -682,55 +682,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; AGPR-NEXT: v_mov_b32_e32 v36, 0
+; AGPR-NEXT: v_mov_b32_e32 v32, 0
; AGPR-NEXT: s_waitcnt lgkmcnt(0)
-; AGPR-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; AGPR-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; AGPR-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; AGPR-NEXT: v_mov_b32_e32 v32, s20
-; AGPR-NEXT: v_mov_b32_e32 v33, s21
-; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31]
-; AGPR-NEXT: v_mov_b32_e32 v34, s22
-; AGPR-NEXT: v_mov_b32_e32 v35, s23
-; AGPR-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; AGPR-NEXT: v_mov_b32_e32 v16, s20
+; AGPR-NEXT: v_mov_b32_e32 v17, s21
+; AGPR-NEXT: v_mov_b32_e32 v18, s22
+; AGPR-NEXT: v_mov_b32_e32 v19, s23
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; AGPR-NEXT: v_mov_b32_e32 v16, s16
; AGPR-NEXT: v_mov_b32_e32 v17, s17
; AGPR-NEXT: v_mov_b32_e32 v18, s18
; AGPR-NEXT: v_mov_b32_e32 v19, s19
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; AGPR-NEXT: v_mov_b32_e32 v16, s12
; AGPR-NEXT: v_mov_b32_e32 v17, s13
; AGPR-NEXT: v_mov_b32_e32 v18, s14
; AGPR-NEXT: v_mov_b32_e32 v19, s15
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_nop 0
; AGPR-NEXT: v_mov_b32_e32 v16, s8
; AGPR-NEXT: v_mov_b32_e32 v17, s9
; AGPR-NEXT: v_mov_b32_e32 v18, s10
; AGPR-NEXT: v_mov_b32_e32 v19, s11
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[34:37], v[38:41], v[0:15]
+; AGPR-NEXT: s_nop 11
+; AGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
;
@@ -739,55 +739,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPR-NEXT: v_mov_b32_e32 v36, 0
+; VGPR-NEXT: v_mov_b32_e32 v32, 0
; VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; VGPR-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; VGPR-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPR-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPR-NEXT: v_mov_b32_e32 v32, s20
-; VGPR-NEXT: v_mov_b32_e32 v33, s21
-; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31]
-; VGPR-NEXT: v_mov_b32_e32 v34, s22
-; VGPR-NEXT: v_mov_b32_e32 v35, s23
-; VGPR-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; VGPR-NEXT: v_mov_b32_e32 v16, s16
; VGPR-NEXT: v_mov_b32_e32 v17, s17
; VGPR-NEXT: v_mov_b32_e32 v18, s18
; VGPR-NEXT: v_mov_b32_e32 v19, s19
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPR-NEXT: v_mov_b32_e32 v16, s12
; VGPR-NEXT: v_mov_b32_e32 v17, s13
; VGPR-NEXT: v_mov_b32_e32 v18, s14
; VGPR-NEXT: v_mov_b32_e32 v19, s15
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
; VGPR-NEXT: s_nop 0
; VGPR-NEXT: v_mov_b32_e32 v16, s8
; VGPR-NEXT: v_mov_b32_e32 v17, s9
; VGPR-NEXT: v_mov_b32_e32 v18, s10
; VGPR-NEXT: v_mov_b32_e32 v19, s11
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[34:37], v[38:41], v[0:15]
+; VGPR-NEXT: s_nop 11
+; VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -859,55 +859,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; AGPR-NEXT: v_mov_b32_e32 v36, 0
+; AGPR-NEXT: v_mov_b32_e32 v32, 0
; AGPR-NEXT: s_waitcnt lgkmcnt(0)
-; AGPR-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; AGPR-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; AGPR-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; AGPR-NEXT: v_mov_b32_e32 v32, s20
-; AGPR-NEXT: v_mov_b32_e32 v33, s21
-; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
-; AGPR-NEXT: v_mov_b32_e32 v34, s22
-; AGPR-NEXT: v_mov_b32_e32 v35, s23
-; AGPR-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; AGPR-NEXT: v_mov_b32_e32 v16, s20
+; AGPR-NEXT: v_mov_b32_e32 v17, s21
+; AGPR-NEXT: v_mov_b32_e32 v18, s22
+; AGPR-NEXT: v_mov_b32_e32 v19, s23
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; AGPR-NEXT: v_mov_b32_e32 v16, s16
; AGPR-NEXT: v_mov_b32_e32 v17, s17
; AGPR-NEXT: v_mov_b32_e32 v18, s18
; AGPR-NEXT: v_mov_b32_e32 v19, s19
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; AGPR-NEXT: v_mov_b32_e32 v16, s12
; AGPR-NEXT: v_mov_b32_e32 v17, s13
; AGPR-NEXT: v_mov_b32_e32 v18, s14
; AGPR-NEXT: v_mov_b32_e32 v19, s15
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_nop 0
; AGPR-NEXT: v_mov_b32_e32 v16, s8
; AGPR-NEXT: v_mov_b32_e32 v17, s9
; AGPR-NEXT: v_mov_b32_e32 v18, s10
; AGPR-NEXT: v_mov_b32_e32 v19, s11
-; AGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[34:37], v[38:41], v[0:15] cbsz:1 abid:2 blgp:3
+; AGPR-NEXT: s_nop 11
+; AGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
-; AGPR-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
;
@@ -916,55 +916,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPR-NEXT: v_mov_b32_e32 v36, 0
+; VGPR-NEXT: v_mov_b32_e32 v32, 0
; VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; VGPR-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; VGPR-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPR-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPR-NEXT: v_mov_b32_e32 v32, s20
-; VGPR-NEXT: v_mov_b32_e32 v33, s21
-; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPR-NEXT: v_mov_b32_e32 v34, s22
-; VGPR-NEXT: v_mov_b32_e32 v35, s23
-; VGPR-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; VGPR-NEXT: v_mov_b32_e32 v16, s16
; VGPR-NEXT: v_mov_b32_e32 v17, s17
; VGPR-NEXT: v_mov_b32_e32 v18, s18
; VGPR-NEXT: v_mov_b32_e32 v19, s19
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPR-NEXT: v_mov_b32_e32 v16, s12
; VGPR-NEXT: v_mov_b32_e32 v17, s13
; VGPR-NEXT: v_mov_b32_e32 v18, s14
; VGPR-NEXT: v_mov_b32_e32 v19, s15
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
; VGPR-NEXT: s_nop 0
; VGPR-NEXT: v_mov_b32_e32 v16, s8
; VGPR-NEXT: v_mov_b32_e32 v17, s9
; VGPR-NEXT: v_mov_b32_e32 v18, s10
; VGPR-NEXT: v_mov_b32_e32 v19, s11
-; VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[34:37], v[38:41], v[0:15] cbsz:1 abid:2 blgp:3
+; VGPR-NEXT: s_nop 11
+; VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
-; VGPR-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPR-NEXT: s_waitcnt vmcnt(0)
; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -1007,12 +1007,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v16, 0
; AGPR-NEXT: s_waitcnt lgkmcnt(0)
-; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -1021,9 +1022,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; AGPR-NEXT: s_nop 1
-; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
-; AGPR-NEXT: v_mov_b32_e32 v16, 0
-; AGPR-NEXT: s_nop 10
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[18:21], v[22:25], v[0:15]
+; AGPR-NEXT: s_nop 11
; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -1035,12 +1035,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
; VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -1049,9 +1050,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPR-NEXT: s_nop 1
-; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
-; VGPR-NEXT: v_mov_b32_e32 v16, 0
-; VGPR-NEXT: s_nop 10
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[18:21], v[22:25], v[0:15]
+; VGPR-NEXT: s_nop 11
; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -1096,12 +1096,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v16, 0
; AGPR-NEXT: s_waitcnt lgkmcnt(0)
-; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -1110,9 +1111,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; AGPR-NEXT: s_nop 1
-; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; AGPR-NEXT: v_mov_b32_e32 v16, 0
-; AGPR-NEXT: s_nop 10
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[18:21], v[22:25], v[0:15] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: s_nop 11
; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -1124,12 +1124,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
; VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -1138,9 +1139,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPR-NEXT: s_nop 1
-; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; VGPR-NEXT: v_mov_b32_e32 v16, 0
-; VGPR-NEXT: s_nop 10
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[18:21], v[22:25], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: s_nop 11
; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index a1fe463de1c54..1af506a716c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -140,6 +140,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: v_mov_b32_e32 v12, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -149,9 +150,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -259,6 +259,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: v_mov_b32_e32 v12, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -268,9 +269,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -1401,55 +1401,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v36, 0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1458,44 +1458,46 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v56, 0
+; GISEL-NEXT: v_mov_b32_e32 v40, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1504,55 +1506,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v36, 0
+; HEURRC-NEXT: v_mov_b32_e32 v32, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT: v_mov_b32_e32 v32, s20
-; HEURRC-NEXT: v_mov_b32_e32 v33, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v34, s22
-; HEURRC-NEXT: v_mov_b32_e32 v35, s23
-; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b32_e32 v16, s12
; HEURRC-NEXT: v_mov_b32_e32 v17, s13
; HEURRC-NEXT: v_mov_b32_e32 v18, s14
; HEURRC-NEXT: v_mov_b32_e32 v19, s15
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v16, s8
; HEURRC-NEXT: v_mov_b32_e32 v17, s9
; HEURRC-NEXT: v_mov_b32_e32 v18, s10
; HEURRC-NEXT: v_mov_b32_e32 v19, s11
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15]
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1561,55 +1563,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v36, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15]
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1743,55 +1745,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v36, 0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1800,44 +1802,46 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v56, 0
+; GISEL-NEXT: v_mov_b32_e32 v40, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1846,55 +1850,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v36, 0
+; HEURRC-NEXT: v_mov_b32_e32 v32, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT: v_mov_b32_e32 v32, s20
-; HEURRC-NEXT: v_mov_b32_e32 v33, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v34, s22
-; HEURRC-NEXT: v_mov_b32_e32 v35, s23
-; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b32_e32 v16, s12
; HEURRC-NEXT: v_mov_b32_e32 v17, s13
; HEURRC-NEXT: v_mov_b32_e32 v18, s14
; HEURRC-NEXT: v_mov_b32_e32 v19, s15
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v16, s8
; HEURRC-NEXT: v_mov_b32_e32 v17, s9
; HEURRC-NEXT: v_mov_b32_e32 v18, s10
; HEURRC-NEXT: v_mov_b32_e32 v19, s11
-; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1903,55 +1907,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v36, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31]
-; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29]
-; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25]
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[34:37], v[38:41], v[0:15] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -2085,12 +2089,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2099,9 +2104,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15]
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2113,6 +2117,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GISEL-NEXT: v_mov_b32_e32 v24, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
@@ -2128,12 +2133,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2141,12 +2145,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2155,9 +2160,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v16, 0
-; HEURRC-NEXT: s_nop 10
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15]
+; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2169,12 +2173,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2183,9 +2188,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT: s_nop 10
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15]
+; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2266,12 +2270,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2280,9 +2285,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2294,6 +2298,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GISEL-NEXT: v_mov_b32_e32 v24, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
@@ -2309,12 +2314,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2322,12 +2326,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2336,9 +2341,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v16, 0
-; HEURRC-NEXT: s_nop 10
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: s_nop 11
; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2350,12 +2354,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -2364,9 +2369,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT: s_nop 10
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[18:21], v[22:25], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: s_nop 11
; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -2582,6 +2586,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: v_mov_b32_e32 v12, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2591,9 +2596,8 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2729,6 +2733,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-NEXT: v_mov_b32_e32 v12, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -2738,9 +2743,8 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3976,35 +3980,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG-NEXT: v_mov_b32_e32 v38, s26
; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
@@ -4014,13 +4013,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4029,44 +4031,46 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v56, 0
+; GISEL-NEXT: v_mov_b32_e32 v40, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4086,35 +4090,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_mov_b32_e32 v38, s26
; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT: s_nop 6
; HEURRC-NEXT: v_mov_b32_e32 v16, s20
; HEURRC-NEXT: v_mov_b32_e32 v17, s21
; HEURRC-NEXT: v_mov_b32_e32 v18, s22
; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; HEURRC-NEXT: v_mov_b32_e32 v16, s12
; HEURRC-NEXT: v_mov_b32_e32 v17, s13
; HEURRC-NEXT: v_mov_b32_e32 v18, s14
; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
@@ -4124,13 +4123,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_mov_b32_e32 v19, s11
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4150,35 +4152,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
@@ -4188,13 +4185,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
@@ -4353,35 +4353,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG-NEXT: v_mov_b32_e32 v38, s26
; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: s_nop 6
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
@@ -4391,13 +4386,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG-NEXT: v_mov_b32_e32 v19, s11
; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4406,44 +4404,46 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v56, 0
+; GISEL-NEXT: v_mov_b32_e32 v40, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4463,35 +4463,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC-NEXT: v_mov_b32_e32 v38, s26
; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: s_nop 6
; HEURRC-NEXT: v_mov_b32_e32 v16, s20
; HEURRC-NEXT: v_mov_b32_e32 v17, s21
; HEURRC-NEXT: v_mov_b32_e32 v18, s22
; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; HEURRC-NEXT: v_mov_b32_e32 v16, s12
; HEURRC-NEXT: v_mov_b32_e32 v17, s13
; HEURRC-NEXT: v_mov_b32_e32 v18, s14
; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
@@ -4501,13 +4496,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC-NEXT: v_mov_b32_e32 v19, s11
; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4527,35 +4525,30 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
@@ -4565,13 +4558,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[24:27], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[28:31], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[20:23], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
@@ -4718,6 +4714,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
@@ -4739,12 +4736,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -4752,6 +4748,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GISEL-NEXT: v_mov_b32_e32 v24, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
@@ -4767,18 +4764,18 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v24, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v16, s20
; HEURRC-NEXT: v_mov_b32_e32 v17, s21
@@ -4800,18 +4797,18 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v16, 0
-; HEURRC-NEXT: s_nop 10
-; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v24, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
@@ -4833,12 +4830,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT: s_nop 10
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
; AGPR: ; %bb.0:
@@ -4924,6 +4920,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
@@ -4945,12 +4942,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -4958,6 +4954,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; GISEL-NEXT: v_mov_b32_e32 v24, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
@@ -4973,18 +4970,18 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v24, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v16, s20
; HEURRC-NEXT: v_mov_b32_e32 v17, s21
@@ -5006,18 +5003,18 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v16, 0
-; HEURRC-NEXT: s_nop 10
-; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; HEURRC-NEXT: s_nop 11
+; HEURRC-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v24, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
@@ -5039,12 +5036,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: s_nop 1
; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
-; VGPRRC-NEXT: s_nop 10
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_nop 11
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v24, v[0:3], s[0:1]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
; AGPR: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 0161cdf03deac..5bdb0c2edf717 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -630,7 +630,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -643,20 +644,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
@@ -669,13 +670,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: s_nop 8
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -964,7 +964,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -977,21 +978,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
@@ -1004,14 +1005,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1775,15 +1775,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-LABEL: test_mfma_f32_16x16x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v16, s20
-; GFX942-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-NEXT: v_mov_b32_e32 v19, s21
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v18, s22
-; GFX942-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-NEXT: v_mov_b32_e32 v20, s22
+; GFX942-NEXT: v_mov_b32_e32 v21, s23
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -1792,9 +1793,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 10
; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -1804,15 +1804,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s21
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s23
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -1821,9 +1822,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 10
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2145,15 +2145,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-LABEL: test_mfma_f32_32x32x8f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v16, s20
-; GFX942-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-NEXT: v_mov_b32_e32 v19, s21
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v18, s22
-; GFX942-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-NEXT: v_mov_b32_e32 v20, s22
+; GFX942-NEXT: v_mov_b32_e32 v21, s23
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2162,9 +2163,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 10
; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2174,15 +2174,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s21
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s23
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2191,9 +2192,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[20:21], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 10
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
@@ -2923,7 +2923,8 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v16, 1
-; GFX942-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -2936,20 +2937,20 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-NEXT: s_nop 9
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 10
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
@@ -2962,13 +2963,12 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: s_nop 9
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v18, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 10
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
@@ -3075,33 +3075,31 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v16, 1
-; GFX942-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-NEXT: v_mov_b32_e32 v18, 2
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v18, 64 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v18, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 8
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
@@ -3748,6 +3746,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v34, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -3787,17 +3786,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: global_store_dwordx4 v34, v[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v34, v[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v34, v[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v34, v[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v34, v[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v34, v[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v34, v[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v34, v[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
@@ -3805,6 +3803,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -3844,17 +3843,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v34, v[4:7], s[34:35] offset:16
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -4031,6 +4029,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -4045,12 +4044,11 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
@@ -4058,6 +4056,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
@@ -4072,12 +4071,11 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: s_nop 8
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -4402,33 +4400,31 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, 2.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v18, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 8
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v18, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 8
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -4556,15 +4552,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mov_b32_e32 v18, 0x40004000
; GFX942-NEXT: v_mov_b32_e32 v19, v18
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
-; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
@@ -4574,15 +4569,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, 0
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 8
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -4770,41 +4764,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
-; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v34, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v33, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v33, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v33, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v33, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v33, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v33, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v33, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v33, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-VGPR-NEXT: s_nop 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v34, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 97a89ec819bae..1e9a90b82e898 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1193,6 +1193,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v22, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -1208,9 +1209,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b32_e32 v21, s29
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1269,11 +1269,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mov_b32_e32 v22, 0
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1332,11 +1332,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mov_b32_e32 v22, 0
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1395,11 +1395,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mov_b32_e32 v22, 0
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1458,11 +1458,11 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
-; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mov_b32_e32 v22, 0
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v22, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 0a36d3dd28f06..88024329d763f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3193,6 +3193,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; SDAG-NEXT: v_mov_b32_e32 v16, s8
@@ -3218,17 +3219,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; SDAG-NEXT: v_mov_b32_e32 v32, s0
-; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: v_mov_b32_e32 v33, s0
+; SDAG-NEXT: v_mov_b32_e32 v34, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v34 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
@@ -3236,6 +3236,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; GISEL-NEXT: v_mov_b32_e32 v32, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
@@ -3253,17 +3254,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_mov_b32_e32 v32, s0
-; GISEL-NEXT: v_mov_b32_e32 v33, s1
+; GISEL-NEXT: v_mov_b32_e32 v33, s0
+; GISEL-NEXT: v_mov_b32_e32 v34, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v34 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -3274,10 +3274,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v32, -2
-; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v33, -2
+; SDAG-NEXT: v_mov_b32_e32 v34, 0x41
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
@@ -3287,7 +3286,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: v_mov_b32_e32 v22, s14
; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; SDAG-NEXT: v_mov_b32_e32 v24, s16
; SDAG-NEXT: v_mov_b32_e32 v25, s17
; SDAG-NEXT: v_mov_b32_e32 v26, s18
@@ -3296,22 +3294,25 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: v_mov_b32_e32 v29, s21
; SDAG-NEXT: v_mov_b32_e32 v30, s22
; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v34, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
@@ -3338,15 +3339,15 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mov_b32_e32 v34, 0
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: global_store_dwordx4 v34, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v34, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v34, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v34, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -3357,27 +3358,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s16
-; SDAG-NEXT: v_mov_b32_e32 v23, s17
-; SDAG-NEXT: v_mov_b32_e32 v24, s18
-; SDAG-NEXT: v_mov_b32_e32 v25, s19
-; SDAG-NEXT: v_mov_b32_e32 v26, s20
-; SDAG-NEXT: v_mov_b32_e32 v27, s21
-; SDAG-NEXT: v_mov_b32_e32 v28, s22
-; SDAG-NEXT: v_mov_b32_e32 v29, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v30, s24
-; SDAG-NEXT: v_mov_b32_e32 v31, s25
-; SDAG-NEXT: v_mov_b32_e32 v32, s26
+; SDAG-NEXT: v_mov_b32_e32 v32, s24
+; SDAG-NEXT: v_mov_b32_e32 v33, s25
+; SDAG-NEXT: v_mov_b32_e32 v34, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v33, s27
+; SDAG-NEXT: v_mov_b32_e32 v35, s27
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3385,22 +3387,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v18, s20
-; SDAG-NEXT: v_mov_b32_e32 v19, s21
-; SDAG-NEXT: v_mov_b32_e32 v20, s22
-; SDAG-NEXT: v_mov_b32_e32 v21, s23
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v18, s0
+; SDAG-NEXT: v_mov_b32_e32 v19, s1
+; SDAG-NEXT: v_mov_b32_e32 v36, s20
+; SDAG-NEXT: v_mov_b32_e32 v37, s21
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[20:27], v[28:35], v[0:15], v18, v19 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v38, s22
+; SDAG-NEXT: v_mov_b32_e32 v39, s23
; SDAG-NEXT: v_mov_b32_e32 v20, s16
; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[36:39], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s14
@@ -3432,8 +3432,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
@@ -3451,36 +3451,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v36, s0
-; GISEL-NEXT: v_mov_b32_e32 v37, s1
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b32_e32 v56, s0
+; GISEL-NEXT: v_mov_b32_e32 v57, s1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v56, v57 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
@@ -3493,26 +3492,27 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 42
-; SDAG-NEXT: v_mov_b32_e32 v17, 25
+; SDAG-NEXT: v_mov_b32_e32 v18, 42
+; SDAG-NEXT: v_mov_b32_e32 v19, 25
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s16
-; SDAG-NEXT: v_mov_b32_e32 v23, s17
-; SDAG-NEXT: v_mov_b32_e32 v24, s18
-; SDAG-NEXT: v_mov_b32_e32 v25, s19
-; SDAG-NEXT: v_mov_b32_e32 v26, s20
-; SDAG-NEXT: v_mov_b32_e32 v27, s21
-; SDAG-NEXT: v_mov_b32_e32 v28, s22
-; SDAG-NEXT: v_mov_b32_e32 v29, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v30, s24
-; SDAG-NEXT: v_mov_b32_e32 v31, s25
-; SDAG-NEXT: v_mov_b32_e32 v32, s26
-; SDAG-NEXT: v_mov_b32_e32 v33, s27
+; SDAG-NEXT: v_mov_b32_e32 v32, s24
+; SDAG-NEXT: v_mov_b32_e32 v33, s25
+; SDAG-NEXT: v_mov_b32_e32 v34, s26
+; SDAG-NEXT: v_mov_b32_e32 v35, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
@@ -3522,20 +3522,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v18, s20
-; SDAG-NEXT: v_mov_b32_e32 v19, s21
-; SDAG-NEXT: v_mov_b32_e32 v20, s22
-; SDAG-NEXT: v_mov_b32_e32 v21, s23
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v36, s20
+; SDAG-NEXT: v_mov_b32_e32 v37, s21
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[20:27], v[28:35], v[0:15], v19, v18 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v38, s22
+; SDAG-NEXT: v_mov_b32_e32 v39, s23
; SDAG-NEXT: v_mov_b32_e32 v20, s16
; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[36:39], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s14
@@ -3566,9 +3564,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v36, 25
-; GISEL-NEXT: v_mov_b32_e32 v37, 42
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 25
+; GISEL-NEXT: v_mov_b32_e32 v57, 42
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
@@ -3586,34 +3584,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v56, v57 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -3626,6 +3624,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
+; SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; SDAG-NEXT: v_mov_b64_e32 v[52:53], 16
+; SDAG-NEXT: v_mov_b64_e32 v[54:55], 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v32, s12
; SDAG-NEXT: v_mov_b32_e32 v33, s13
@@ -3645,53 +3647,50 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG-NEXT: v_mov_b32_e32 v46, s26
; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT: s_nop 14
; SDAG-NEXT: v_mov_b32_e32 v16, s20
; SDAG-NEXT: v_mov_b32_e32 v17, s21
; SDAG-NEXT: v_mov_b32_e32 v18, s22
; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[48:49], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: global_store_dwordx4 v[52:53], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[54:55], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[32:39], v[40:47], v[0:15] blgp:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: global_store_dwordx4 v[50:51], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[48:49], v[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[54:55], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[52:53], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3699,45 +3698,54 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[32:39], v[40:47], v[0:15] blgp:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -3750,8 +3758,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v32, 42
-; SDAG-NEXT: v_mov_b32_e32 v33, 25
+; SDAG-NEXT: v_mov_b32_e32 v38, 42
+; SDAG-NEXT: v_mov_b32_e32 v39, 25
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], 48
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
@@ -3779,44 +3788,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v39, v38 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 32
+; SDAG-NEXT: global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 16
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 0
; SDAG-NEXT: v_mov_b32_e32 v16, s12
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: v_mov_b32_e32 v18, s14
; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v16, s8
; SDAG-NEXT: v_mov_b32_e32 v17, s9
; SDAG-NEXT: v_mov_b32_e32 v18, s10
; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3824,9 +3831,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v32, 25
-; GISEL-NEXT: v_mov_b32_e32 v33, 42
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b32_e32 v56, 25
+; GISEL-NEXT: v_mov_b32_e32 v57, 42
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
@@ -3844,34 +3851,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v56, v57 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 63466f89e668f..2b28492632433 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -31,19 +31,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT: s_mov_b32 s8, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT: s_mov_b32 s9, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -78,19 +78,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT: s_mov_b32 s8, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT: s_mov_b32 s9, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_nop 6
+; GFX942-GISEL-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -103,12 +103,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 1.0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, 2.0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0x40400000
; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 4.0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -119,13 +120,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[20:21], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 10
+; GFX942-SDAG-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32:
@@ -134,11 +134,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
+; GFX942-GISEL-NEXT: s_mov_b32 s20, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GFX942-GISEL-NEXT: s_mov_b32 s21, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -150,12 +151,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -174,6 +174,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 4.0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -185,12 +186,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_nop 10
+; GFX942-SDAG-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32_vgprcd:
@@ -199,11 +199,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
+; GFX942-GISEL-NEXT: s_mov_b32 s20, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GFX942-GISEL-NEXT: s_mov_b32 s21, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v20, 0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -215,12 +216,11 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_nop 10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6132baceb1ce3..8fce10d6cc16f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -55,12 +55,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[6:7]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -104,12 +103,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
-; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[6:7]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -310,12 +308,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v17 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -341,16 +338,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-NEXT: v_mov_b32_e32 v29, s16
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
@@ -372,12 +368,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; SDAG-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
-; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -403,16 +398,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
-; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1099,12 +1093,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v16, s16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; GCN-NEXT: v_mov_b32_e32 v17, s16
; GCN-NEXT: v_mov_b32_e32 v16, 0
-; GCN-NEXT: s_nop 10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v17 cbsz:1 abid:2
+; GCN-NEXT: s_nop 11
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -1130,12 +1123,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
; GCN-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[8:9]
-; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s16
-; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
-; GCN-VGPR-NEXT: s_nop 0
-; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s16
; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GCN-VGPR-NEXT: s_nop 10
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v17 cbsz:1 abid:2
+; GCN-VGPR-NEXT: s_nop 11
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
@@ -1446,6 +1438,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -1458,9 +1451,8 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
@@ -1502,6 +1494,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -1514,9 +1507,8 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1711,6 +1703,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1723,16 +1716,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
@@ -1743,6 +1735,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -1754,16 +1747,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b32_e32 v29, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
@@ -1779,6 +1771,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, 0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -1791,16 +1784,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
@@ -1811,6 +1803,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -1822,16 +1815,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2345,6 +2337,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2357,9 +2350,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
@@ -2401,6 +2393,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2413,9 +2406,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2636,6 +2628,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2648,9 +2641,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
@@ -2692,6 +2684,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2704,9 +2697,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2927,6 +2919,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2939,9 +2932,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
@@ -2983,6 +2975,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -2995,9 +2988,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3218,6 +3210,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -3230,9 +3223,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
@@ -3274,6 +3266,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
@@ -3286,9 +3279,8 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-VGPR-NEXT: s_nop 6
-; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-VGPR-NEXT: s_nop 7
+; GISEL-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[0:1]
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3483,6 +3475,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3495,16 +3488,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
@@ -3515,6 +3507,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -3526,16 +3519,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b32_e32 v29, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
@@ -3551,6 +3543,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, 0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -3563,16 +3556,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
@@ -3583,6 +3575,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -3594,16 +3587,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -4091,6 +4083,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -4103,16 +4096,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
@@ -4123,6 +4115,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -4134,16 +4127,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b32_e32 v29, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
@@ -4159,6 +4151,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, 0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -4171,16 +4164,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
@@ -4191,6 +4183,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -4202,16 +4195,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -4699,6 +4691,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -4711,16 +4704,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
@@ -4731,6 +4723,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -4742,16 +4735,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b32_e32 v29, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
@@ -4767,6 +4759,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, 0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -4779,16 +4772,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
@@ -4799,6 +4791,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -4810,16 +4803,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5307,6 +5299,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -5319,16 +5312,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: v_mov_b32_e32 v21, s1
; SDAG-NEXT: v_mov_b32_e32 v22, s2
; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 10
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
@@ -5339,6 +5331,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-NEXT: v_mov_b32_e32 v28, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -5350,16 +5343,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-NEXT: v_mov_b32_e32 v29, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 10
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
@@ -5375,6 +5367,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, 0
; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
@@ -5387,16 +5380,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s16
; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
; SDAG-VGPR-NEXT: s_nop 0
-; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-VGPR-NEXT: s_nop 10
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[6:7] offset:16
; SDAG-VGPR-NEXT: s_endpgm
;
; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
@@ -5407,6 +5399,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, 0
; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
@@ -5418,16 +5411,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v29, s2
; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
; GISEL-VGPR-NEXT: s_nop 0
-; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-VGPR-NEXT: s_nop 10
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v29 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: s_nop 11
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v28, v[12:15], s[0:1] offset:48
; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index c57cdcac739ff..3e56a809c9845 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -705,6 +705,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY942-NEXT: v_mov_b32_e32 v32, 1.0
; GREEDY942-NEXT: v_mov_b32_e32 v33, 2.0
+; GREEDY942-NEXT: v_mov_b32_e32 v34, 0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
@@ -734,14 +735,13 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: v_mov_b32_e32 v13, v27
; GREEDY942-NEXT: v_mov_b32_e32 v14, v28
; GREEDY942-NEXT: v_mov_b32_e32 v15, v29
-; GREEDY942-NEXT: v_mov_b32_e32 v16, 0
-; GREEDY942-NEXT: s_nop 0
+; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v32, v33, v[0:15]
; GREEDY942-NEXT: s_nop 9
-; GREEDY942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GREEDY942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GREEDY942-NEXT: global_store_dwordx4 v34, v[12:15], s[16:17] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v34, v[8:11], s[16:17] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v34, v[4:7], s[16:17] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v34, v[0:3], s[16:17]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
diff --git a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
index a588e99980c6e..e9019ba097c12 100644
--- a/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
+++ b/llvm/test/CodeGen/AMDGPU/misched-remat-revert.ll
@@ -81,17 +81,9 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.exit:
- ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_MOV_B32_1]].sub0
- ; CHECK-NEXT: undef [[S_MOV_B32_2:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 16843009
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_2]].sub0
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_2]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg2.kernarg.offset, align 8, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset, align 8, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 48, 0 :: (dereferenceable invariant load (s32) from %ir.x7.kernarg.offset, align 16, addrspace 4)
@@ -100,162 +92,170 @@ define amdgpu_kernel void @test_revert_schedule(i32 %arg0, i32 %arg1, ptr addrsp
; CHECK-NEXT: [[S_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 80, 0 :: (dereferenceable invariant load (s32) from %ir.a2.kernarg.offset, align 16, addrspace 4)
; CHECK-NEXT: early-clobber %119:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 84, 0 :: (dereferenceable invariant load (s64) from %ir.arg8.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: KILL [[COPY]](p4)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16384, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20480, implicit $exec
- ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16384, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[S_MOV_B32_2:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 16843009
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_2]].sub0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_2]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN4:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY %119.sub0
- ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub1:vreg_64_align2 = COPY %119.sub1
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM5]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM3]]
- ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_LOAD_DWORD_IMM4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM6]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN5:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY7]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM5]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM3]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN5:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN6:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_5]], [[S_MOV_B32_1]], 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[COPY7]]
+ ; CHECK-NEXT: KILL [[COPY5]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_5]]
+ ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_LOAD_DWORD_IMM4]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_ASHR_I32_]], 26, implicit-def dead $scc
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN7:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY9]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN8:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[COPY9]]
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LOAD_DWORD_IMM4]], [[S_LSHR_B32_]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_ADD_I32_]], 6, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[S_ASHR_I32_1]], 1, implicit-def dead $scc
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], 19456, implicit-def dead $scc
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN7:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY6]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN8:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY7]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[COPY7]]
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM6]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20480, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN9:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY8]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN10:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY10]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[COPY10]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN10:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[COPY8]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[S_MOV_B32_1]], 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM7]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY5]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM8]]
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN11:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY11]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN12:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY12]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[COPY11]]
- ; CHECK-NEXT: KILL [[COPY12]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY6]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM7]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_9]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM8]]
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN11:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY9]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN12:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY10]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[COPY9]]
+ ; CHECK-NEXT: KILL [[COPY10]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_10]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN13:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN14:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_MOV_B32_e32_6]], [[S_MOV_B32_1]], 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
; CHECK-NEXT: KILL [[V_MOV_B32_e32_6]]
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN15:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY13]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_11]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN15:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY11]], [[S_MOV_B32_1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) null`, align 1, addrspace 8)
; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_12]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY12:%[0-9]+]].sub0:vreg_64_align2 = COPY %119.sub0
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]].sub1:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
+ ; CHECK-NEXT: undef [[COPY13:%[0-9]+]].sub1:vreg_64_align2 = COPY %119.sub1
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_13]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]].sub0:vreg_64_align2 = COPY [[DS_READ_B32_gfx9_]].sub1
; CHECK-NEXT: [[V_OR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX4_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e32_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 1, [[COPY1]](s32), implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e32_4:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 24, [[V_AND_B32_e32_1]], implicit $exec
; CHECK-NEXT: [[V_XOR_B32_e32_2:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[V_XOR_B32_e32_]], [[V_OR_B32_e32_3]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e32_5:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_XOR_B32_e32_2]], [[V_XOR_B32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY12]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_15]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_LSHLREV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 1, [[V_OR_B32_e32_5]], implicit $exec
; CHECK-NEXT: [[V_XOR_B32_e32_3:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 1, [[V_OR_B32_e32_]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e32_6:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_LOAD_DWORDX4_IMM]].sub0, [[V_XOR_B32_e32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY13]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_16]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_XOR_B32_e32_4:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 1, [[V_OR_B32_e32_4]], implicit $exec
; CHECK-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 3, [[COPY1]](s32), implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_LSHLREV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 1, [[COPY1]](s32), implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_17]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: DS_WRITE_B128_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[BUFFER_LOAD_DWORDX4_OFFSET2]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) null`, addrspace 3)
- ; CHECK-NEXT: DS_WRITE_B128_gfx9 [[COPY8]], [[BUFFER_LOAD_DWORDX4_OFFEN2]], 0, 0, implicit $exec :: (store (s128) into %ir.p18, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B128_gfx9 [[COPY6]], [[BUFFER_LOAD_DWORDX4_OFFEN2]], 0, 0, implicit $exec :: (store (s128) into %ir.p18, addrspace 3)
; CHECK-NEXT: [[V_OR_B32_e32_7:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 1, [[V_AND_B32_e32_2]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_18]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM3]], [[V_OR_B32_e32_7]], implicit $exec
; CHECK-NEXT: DS_WRITE_B128_gfx9 [[V_ADD_U32_e32_3]], [[BUFFER_LOAD_DWORDX4_OFFSET3]], 512, 0, implicit $exec :: (store (s128) into %ir.9, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_19]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:av_32 = DS_READ_B32_gfx9 [[COPY14]], 0, 0, implicit $exec :: (load (s32) from %ir.4, align 8, addrspace 3)
; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_LSHLREV_B32_e32_1]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_OR_B32_e32_2]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_20]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_2:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_4]], 0, 0, implicit $exec :: (load (s32) from %ir.p10, addrspace 3)
; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:av_64_align2 = DS_READ_B64_gfx9 [[V_ADD_U32_e32_5]], 0, 0, implicit $exec :: (load (s64) from %ir.p11, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_3:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_21]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
; CHECK-NEXT: undef [[COPY15:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:av_64_align2 = COPY [[DS_READ_B64_gfx9_]].sub1
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_22]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM1]], [[V_LSHLREV_B32_e32_]], implicit $exec
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_4:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_6]], 0, 0, implicit $exec :: (load (s32) from %ir.p4, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_4:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_23]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_SUB_U32_e32_]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub1, [[V_ADD_U32_e32_7]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_29:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[DS_READ_B32_gfx9_5:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_8]], 0, 0, implicit $exec :: (load (s32) from %ir.p8, addrspace 3)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_24]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_5:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
; CHECK-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_OR_B32_e32_1]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORDX4_IMM]].sub3, [[V_LSHLREV_B32_e32_]], implicit $exec
- ; CHECK-NEXT: undef [[DS_READ_B32_gfx9_5:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_8]], 0, 0, implicit $exec :: (load (s32) from %ir.p8, addrspace 3)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_25]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_6:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_9]], 0, 0, implicit $exec :: (load (s32) from %ir.sunkaddr3, addrspace 3)
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_7:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_10]], 0, 0, implicit $exec :: (load (s32) from %ir.p5, addrspace 3)
; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_7]], 0, 0, implicit $exec :: (load (s32) from %ir.i85, align 8, addrspace 3)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_26]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_6:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_7:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
; CHECK-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[S_LOAD_DWORD_IMM2]], [[V_OR_B32_e32_6]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_27]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_8:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_ADD_U32_e32_11]], 0, 0, implicit $exec :: (load (s32) from %ir.p2, align 16, addrspace 3)
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_9:%[0-9]+]].sub1:av_64_align2 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e32_2]], 8192, 0, implicit $exec :: (load (s32) from %ir.p13, addrspace 3)
; CHECK-NEXT: undef [[DS_READ_B32_gfx9_10:%[0-9]+]].sub0:av_64_align2 = DS_READ_B32_gfx9 [[V_XOR_B32_e32_4]], 8192, 0, implicit $exec :: (load (s32) from %ir.p9, align 16, addrspace 3)
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_5:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_6:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_7:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_29:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_28]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_9:%[0-9]+]].sub0:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
; CHECK-NEXT: SCHED_BARRIER 0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_30:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN15]].sub2_sub3, 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]].sub1:av_64_align2 = COPY [[DS_READ_B32_gfx9_1]]
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_30:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_8:%[0-9]+]].sub1:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
; CHECK-NEXT: [[DS_READ_B32_gfx9_10:%[0-9]+]].sub1:av_64_align2 = COPY [[S_MOV_B32_1]].sub0
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_31:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY4]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_14]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHRREV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_31:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_2]], [[BUFFER_LOAD_DWORDX4_OFFEN4]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: undef [[V_AND_OR_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_AND_OR_B32_e64 [[V_LSHLREV_B32_e32_3]], 56, [[V_MUL_LO_U32_e64_]], implicit $exec
; CHECK-NEXT: [[V_AND_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]](s32), 48, [[V_AND_B32_e32_]], implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_32:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_2]], [[BUFFER_LOAD_DWORDX4_OFFEN4]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_LSHLREV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 2, [[V_AND_OR_B32_e64_1]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_32:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_30]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_33:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY15]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_31]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_34:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN5]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_32]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_35:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B64_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_33]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_36:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN5]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_34]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_37:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN6]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_35]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_38:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN9]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_36]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_39:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_4]], [[BUFFER_LOAD_DWORDX4_OFFEN6]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_37]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_40:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN9]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_38]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_41:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_5]], [[BUFFER_LOAD_DWORDX4_OFFEN8]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_39]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_42:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN10]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_40]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_43:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_3]], [[BUFFER_LOAD_DWORDX4_OFFEN8]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_41]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_44:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN10]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_42]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_45:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_6]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_43]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_46:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN7]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_44]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_47:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_45]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_48:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_9]], [[BUFFER_LOAD_DWORDX4_OFFEN7]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_46]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_49:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_7]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_47]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_50:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN11]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_48]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_51:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN13]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_49]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_52:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN11]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_50]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_53:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN3]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_29]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_54:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN14]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_51]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_55:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_10]], [[BUFFER_LOAD_DWORDX4_OFFEN12]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_52]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_56:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_8]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_53]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_57:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN15]].sub2_sub3, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: DS_WRITE_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_57]].sub0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) null`, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_58:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN14]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_54]], 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_59:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN15]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_55]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_30]].sub0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) null`, addrspace 3)
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_33:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_31]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_34:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[COPY15]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_32]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_35:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN5]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_33]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_36:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B64_gfx9_]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_34]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_37:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN5]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_35]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_38:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN6]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_36]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_39:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN7]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_37]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_40:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_4]], [[BUFFER_LOAD_DWORDX4_OFFEN6]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_38]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_41:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN7]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_39]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_42:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_5]], [[BUFFER_LOAD_DWORDX4_OFFEN10]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_40]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_43:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN8]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_41]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_44:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_3]], [[BUFFER_LOAD_DWORDX4_OFFEN10]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_42]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_45:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN8]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_43]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_46:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_6]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_44]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_47:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN9]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_45]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_48:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_46]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_49:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_9]], [[BUFFER_LOAD_DWORDX4_OFFEN9]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_47]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_50:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_7]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_48]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_51:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN11]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_49]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_52:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN13]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_50]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_53:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN11]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_51]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_54:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN3]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_29]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_55:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN14]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_52]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_56:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_10]], [[BUFFER_LOAD_DWORDX4_OFFEN12]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_53]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_57:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B32_gfx9_8]], [[V_MOV_B1]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_54]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_58:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN14]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_55]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_59:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN15]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_56]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: DS_WRITE_B32_gfx9 [[V_LSHLREV_B32_e32_4]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_59]].sub0, 0, 0, implicit $exec :: (store (s32) into %ir.p21, addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 [[DS_READ_B32_gfx9_]].sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_58]].sub0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) null`, addrspace 3)
- ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN13]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_56]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[V_MOV_B1]], [[BUFFER_LOAD_DWORDX4_OFFEN13]].sub0_sub1, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_57]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: DS_WRITE_B32_gfx9 [[COPY14]], [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_60]].sub0, 0, 0, implicit $exec :: (store (s32) into %ir.4, addrspace 3)
; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_ASHRREV_I32_e32 31, [[V_AND_OR_B32_e64_]].sub0, implicit $exec
; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 1, [[V_AND_OR_B32_e64_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 401ad6489bf6e..58cc47cdab976 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 s2, 0
; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: s_mov_b32 s6, 0
+; GFX942-NEXT: s_mov_b32 s3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -18,16 +18,16 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: s_branch .LBB0_2
; GFX942-NEXT: .LBB0_1: ; %bb2
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT: s_or_b32 s4, s3, 1
+; GFX942-NEXT: s_ashr_i32 s5, s3, 31
; GFX942-NEXT: s_mov_b32 s3, s2
-; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
; GFX942-NEXT: v_mov_b32_e32 v2, v1
; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: s_or_b32 s4, s6, 1
-; GFX942-NEXT: s_ashr_i32 s3, s6, 31
+; GFX942-NEXT: s_and_b32 s3, s5, s4
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[4:5], v[4:5], v[0:3]
-; GFX942-NEXT: s_and_b32 s6, s3, s4
-; GFX942-NEXT: s_nop 5
+; GFX942-NEXT: s_nop 6
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_cbranch_execz .LBB0_4
; GFX942-NEXT: .LBB0_2: ; %bb
@@ -35,7 +35,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
; GFX942-NEXT: ; %bb.3:
-; GFX942-NEXT: ; implicit-def: $sgpr6
+; GFX942-NEXT: ; implicit-def: $sgpr3
; GFX942-NEXT: .LBB0_4: ; %common.ret
; GFX942-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 47ebd072c4cc7..07d2c7c1954c0 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -136,21 +136,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v34, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_nop 15
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v34, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v34, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v34, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v34, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v34, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v34, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v34, v[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx4 v34, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -169,21 +169,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b32_e32 v34, 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 15
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_store_dwordx4 v34, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v34, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v34, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v34, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v34, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v34, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v34, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v34, v[0:3], s[0:1]
; CHECK-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -202,21 +202,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(
; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b32_e32 v34, 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 15
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_store_dwordx4 v34, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v34, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v34, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v34, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v34, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v34, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v34, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v34, v[0:3], s[0:1]
; CHECK-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list