[llvm-branch-commits] [llvm] [AMDGPU] Add MemoryPipeline scheduling to Coexec sched (PR #192325)
Jeffrey Byrnes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Apr 24 09:56:13 PDT 2026
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/192325
>From 6edc82a25c6ca54cc4b20c0cec42cce4d614e6f4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 20 Mar 2026 14:58:27 -0700
Subject: [PATCH 1/7] [AMDGPU] Add MemoryPipeline scheduling to Coexec sched
Change-Id: I52c476834155823d1ba998cdbbcb3ad6a7e6f2f5
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 100 ++++--
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 18 +
.../AMDGPU/coexec-sched-effective-stall.mir | 323 ++++++++++++++++++
3 files changed, 418 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 71d016dfea6ae..0292deee4ae9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -52,7 +52,7 @@ InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
// Check for specific opcodes first.
if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
- Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
+ Opc == AMDGPU::S_BARRIER_SIGNAL_IMM || SII.isWaitcnt(Opc))
return InstructionFlavor::Fence;
if (SII.isLDSDMA(MI))
@@ -409,19 +409,8 @@ unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
}
bool CandidateHeuristics::tryEffectiveStall(
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::SchedCandidate &TryCand, SchedBoundary &Zone) {
- // Treat structural and latency stalls as a single scheduling cost for the
- // current cycle.
- struct StallCosts {
- unsigned Ready = 0;
- unsigned Structural = 0;
- unsigned Latency = 0;
- unsigned Effective = 0;
- unsigned Carried = 0;
- unsigned Buffer = 0;
- };
-
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) {
auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
InstructionFlavor Flavor = classifyFlavor(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
@@ -442,6 +431,43 @@ bool CandidateHeuristics::tryEffectiveStall(
};
unsigned CurrCycle = Zone.getCurrCycle();
+
+ auto getFenceStalls = [this, &CurrCycle](SUnit *SU) -> unsigned {
+ InstructionFlavor Flavor = classifyFlavor(
+ *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
+
+ if (Flavor != InstructionFlavor::Fence)
+ return 0;
+
+ HardwareUnitInfo *FenceHWUI = getHWUIFromFlavor(Flavor);
+ HardwareUnitInfo *DSHWUI = getHWUIFromFlavor(InstructionFlavor::DS);
+
+ SUnit *LastDS = DSHWUI->getLastScheduledSU();
+ if (!LastDS)
+ return 0;
+
+ SUnit *LastFence = FenceHWUI->getLastScheduledSU();
+ unsigned LastFenceCycle = LastFence ? LastFence->TopReadyCycle : 0;
+ unsigned LastDSCycle = LastDS->TopReadyCycle;
+
+ if (LastDSCycle < LastFenceCycle)
+ return 0;
+
+ unsigned LastDSFinish = LastDSCycle + getHWUICyclesForSU(LastDS);
+ return LastDSFinish <= CurrCycle ? 0 : LastDSFinish - CurrCycle;
+ };
+
+ // Treat stalls as a single scheduling cost for the current cycle.
+ struct StallCosts {
+ unsigned Ready = 0;
+ unsigned Structural = 0;
+ unsigned Latency = 0;
+ unsigned Carried = 0;
+ unsigned Buffer = 0;
+ unsigned Fence = 0;
+ unsigned Effective = 0;
+ };
+
auto GetStallCosts = [&](SUnit *SU) {
unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
StallCosts Costs;
@@ -451,9 +477,9 @@ bool CandidateHeuristics::tryEffectiveStall(
unsigned CarriedLatency = CarriedLatencies.lookup_or(SU->getInstr(), 0);
Costs.Carried = CarriedLatency > CurrCycle ? CarriedLatency - CurrCycle : 0;
Costs.Buffer = getBufferFullStalls(SU);
-
+ Costs.Fence = getFenceStalls(SU);
Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency,
- Costs.Carried, Costs.Buffer});
+ Costs.Carried, Costs.Buffer, Costs.Fence});
return Costs;
};
@@ -475,6 +501,33 @@ bool CandidateHeuristics::tryEffectiveStall(
AMDGPUCoExecSchedStrategy::Stall);
}
+bool CandidateHeuristics::tryMemoryPipeline(
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand) {
+
+ InstructionFlavor TryFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
+
+ InstructionFlavor CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
+
+ bool TryIsMemoryPipeline = TryFlavor == InstructionFlavor::DMA ||
+ TryFlavor == InstructionFlavor::Fence;
+ bool CandIsMemoryPipeline = CandFlavor == InstructionFlavor::DMA ||
+ CandFlavor == InstructionFlavor::Fence;
+
+ if (TryIsMemoryPipeline == CandIsMemoryPipeline)
+ return false;
+
+ if (CandIsMemoryPipeline) {
+ if (Cand.Reason > GenericSchedulerBase::RegCritical)
+ Cand.Reason = GenericSchedulerBase::RegCritical;
+
+ return true;
+ }
+
+ TryCand.Reason = GenericSchedulerBase::RegCritical;
+ return true;
+}
+
bool CandidateHeuristics::tryCriticalResourceDependency(
GenericSchedulerBase::SchedCandidate &TryCand,
GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
@@ -806,8 +859,15 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
if (SameBoundary) {
// Compare candidates by the stall they would introduce if
// scheduled in the current cycle.
- if (Heurs.tryEffectiveStall(Cand, TryCand, *Zone))
+ if (Heurs.tryEffectiveStall(TryCand, Cand, *Zone)) {
+ LastAMDGPUReason = AMDGPUSchedReason::Stall;
return TryCand.Reason != NoCand;
+ }
+
+ if (Heurs.tryMemoryPipeline(TryCand, Cand)) {
+ LastAMDGPUReason = AMDGPUSchedReason::MemoryPipeline;
+ return TryCand.Reason != NoCand;
+ }
Heurs.sortHWUIResources();
if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
@@ -852,12 +912,6 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
if (SameBoundary) {
- // Avoid serializing long latency dependence chains.
- // For acyclic path limited loops, latency was already checked above.
- if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
- !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
- return TryCand.Reason != NoCand;
-
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 177f77dde7562..4b1568c674300 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -127,6 +127,8 @@ inline FlavorGroup all() {
/// than the generic CandReason enum for debugging purposes.
enum class AMDGPUSchedReason : uint8_t {
None,
+ Stall,
+ MemoryPipeline,
CritResourceBalance, // tryCriticalResource chose based on resource pressure
CritResourceDep, // tryCriticalResourceDependency chose based on enabling
NUM_REASONS
@@ -136,6 +138,10 @@ constexpr StringRef getReasonName(AMDGPUSchedReason R) {
switch (R) {
case AMDGPUSchedReason::None:
return "None";
+ case AMDGPUSchedReason::Stall:
+ return "Stall";
+ case AMDGPUSchedReason::MemoryPipeline:
+ return "MemoryPipeline";
case AMDGPUSchedReason::CritResourceBalance:
return "CritResource";
case AMDGPUSchedReason::CritResourceDep:
@@ -236,6 +242,15 @@ class HardwareUnitInfo {
ScheduledSUs[ScheduledSUs.size() - BufferSize]->TopReadyCycle;
}
+ /// \returns the most recently scheduled SU for this HardwareUnit.
+ SUnit *getLastScheduledSU() {
+ unsigned ScheduledCount = ScheduledSUs.size();
+ if (!ScheduledCount)
+ return nullptr;
+
+ return ScheduledSUs[ScheduledCount - 1];
+ }
+
/// \returns the SUnit with higher priority or nullptr if they are the same.
/// This method looks through the PrioritySUs to determine if one SU is more
/// prioritized than the other. If neither are in the PrioritySUs list, then
@@ -349,6 +364,9 @@ class CandidateHeuristics {
GenericSchedulerBase::SchedCandidate &Cand,
SchedBoundary &Zone);
+ bool tryMemoryPipeline(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand);
+
/// Check for critical resource consumption. Prefer the candidate that uses
/// the most prioritized HardwareUnit. If both candidates use the same
/// HarwareUnit, prefer the candidate with higher priority on that
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 0a6f2fe9375d5..7868c7dcf88a3 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -5,6 +5,9 @@
--- |
define void @test-sched-effective-stall() #0 { ret void }
define void @test-sched-pending-structural-stall() #0 { ret void }
+ define void @test-fence-stall() #0 { ret void }
+ define void @test-tensorcnt-stall() #0 { ret void }
+ define void @test-dscnt-stall() #0 { ret void }
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
...
@@ -121,3 +124,323 @@ body: |
S_NOP 0
S_ENDPGM 0, implicit %10, implicit %11
...
+
+
+---
+name: test-fence-stall
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; DEFAULT-LABEL: name: test-fence-stall
+ ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+ ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+ ; DEFAULT-NEXT: ATOMIC_FENCE 4, 2
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
+ ; COEXEC-LABEL: name: test-fence-stall
+ ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
+ ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ %95:vgpr_32 = IMPLICIT_DEF
+ %90:vreg_256_align2 = IMPLICIT_DEF
+ %91:vreg_256_align2 = IMPLICIT_DEF
+ %85:sreg_64_xexec = IMPLICIT_DEF
+ %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %2373:vreg_512_align2 = IMPLICIT_DEF
+ %2369:vreg_512_align2 = IMPLICIT_DEF
+ %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %8:vgpr_32_lo256 = IMPLICIT_DEF
+ undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+ %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+ undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+ %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+ undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+ %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+ undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+ %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+ undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+ %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+ undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+ %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+ undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+ %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+ %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+ %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ ATOMIC_FENCE 4, 2
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, amdgpu_allvgprs
+...
+
+
+---
+name: test-tensorcnt-stall
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; DEFAULT-LABEL: name: test-tensorcnt-stall
+ ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+ ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+ ; DEFAULT-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
+ ; COEXEC-LABEL: name: test-tensorcnt-stall
+ ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ %95:vgpr_32 = IMPLICIT_DEF
+ %90:vreg_256_align2 = IMPLICIT_DEF
+ %91:vreg_256_align2 = IMPLICIT_DEF
+ %85:sreg_64_xexec = IMPLICIT_DEF
+ %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %2373:vreg_512_align2 = IMPLICIT_DEF
+ %2369:vreg_512_align2 = IMPLICIT_DEF
+ %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %8:vgpr_32_lo256 = IMPLICIT_DEF
+ undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+ %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+ undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+ %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+ undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+ %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+ undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+ %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+ undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+ %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+ undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+ %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+ undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+ %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+ %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+ %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, amdgpu_allvgprs
+...
+
+---
+name: test-dscnt-stall
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; DEFAULT-LABEL: name: test-dscnt-stall
+ ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 448, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 512, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 576, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 640, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 704, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 768, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 832, 0, implicit $exec
+ ; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 896, 0, implicit $exec
+ ; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 960, 0, implicit $exec
+ ; DEFAULT-NEXT: dead [[DEF5:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF6:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
+ ; DEFAULT-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
+ ; COEXEC-LABEL: name: test-dscnt-stall
+ ; COEXEC: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 64, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 128, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 192, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 256, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 320, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 384, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 448, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 512, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_4:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 576, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 640, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_5:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 704, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 768, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_6:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 832, 0, implicit $exec
+ ; COEXEC-NEXT: undef [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 896, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DS_LOAD_TR16_B128_7:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF]], 960, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ %95:vgpr_32 = IMPLICIT_DEF
+ %90:vreg_256_align2 = IMPLICIT_DEF
+ %91:vreg_256_align2 = IMPLICIT_DEF
+ %85:sreg_64_xexec = IMPLICIT_DEF
+ %249:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %2373:vreg_512_align2 = IMPLICIT_DEF
+ %2369:vreg_512_align2 = IMPLICIT_DEF
+ %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %8:vgpr_32_lo256 = IMPLICIT_DEF
+ undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
+ %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
+ undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
+ %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
+ undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
+ %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
+ undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
+ %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
+ undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
+ %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
+ undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
+ %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
+ undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
+ %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
+ %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
+ %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, amdgpu_allvgprs
+...
>From 8203e7624d0df463aa76282e9ab1418e31e39fde Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 25 Mar 2026 18:22:24 -0700
Subject: [PATCH 2/7] Add a comment
Change-Id: I447f7f1fb185b18924cfd98249b5a0a05fef2484
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 4b1568c674300..6e056a9554afa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -364,6 +364,13 @@ class CandidateHeuristics {
GenericSchedulerBase::SchedCandidate &Cand,
SchedBoundary &Zone);
+ /// Prioritize instructions involved in the memory pipeline. Currently we don't have
+ /// any modelling of pipelined loads, so we control the layout of the pipeline
+ /// per iteration by giving the user some control over the stalls (e.g. between
+ /// s_barrier_signal and s_barrier_wait) and scheduling the pipeline instructions
+ /// as soon as they are ready.
+ ///
+ /// TODO -- add better modelling and heuristics for pipelining based scheduling.
bool tryMemoryPipeline(GenericSchedulerBase::SchedCandidate &TryCand,
GenericSchedulerBase::SchedCandidate &Cand);
>From 2bfdfe27b509bfde351dea64571971d919f8a06c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:04:56 -0700
Subject: [PATCH 3/7] Make fence heuristic work bottom-up
Change-Id: I629cbc8905b87a962e8b123287e5f60a3154df6b
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 36 ++++++++++---------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 0292deee4ae9c..1db3f6201d1b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -375,8 +375,7 @@ void CandidateHeuristics::sortHWUIResources() {
unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
SUnit *SU) {
- // Only implemented for top-down scheduling currently.
- if (!Zone.isTop() || !SU)
+ if (!SU)
return 0;
MachineInstr *MI = SU->getInstr();
@@ -411,6 +410,10 @@ unsigned CandidateHeuristics::getStructuralStallCycles(SchedBoundary &Zone,
bool CandidateHeuristics::tryEffectiveStall(
GenericSchedulerBase::SchedCandidate &TryCand,
GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) {
+ // Only implemented for top-down scheduling
+ if (!Zone.isTop())
+ return 0;
+
auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
InstructionFlavor Flavor = classifyFlavor(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
@@ -420,8 +423,6 @@ bool CandidateHeuristics::tryEffectiveStall(
if (HWUI->getBufferSize() == 0)
return 0;
- // getBufferAvailableCycle assumes top-down scheduling.
- assert(Zone.isTop());
unsigned CurrCycle = Zone.getCurrCycle();
unsigned BufferReadyCycle = HWUI->getBufferAvailableCycle(CurrCycle);
if (BufferReadyCycle <= CurrCycle)
@@ -432,29 +433,30 @@ bool CandidateHeuristics::tryEffectiveStall(
unsigned CurrCycle = Zone.getCurrCycle();
- auto getFenceStalls = [this, &CurrCycle](SUnit *SU) -> unsigned {
+ auto getFenceStalls = [this, &CurrCycle, &Zone](SUnit *SU) -> unsigned {
InstructionFlavor Flavor = classifyFlavor(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
- if (Flavor != InstructionFlavor::Fence)
+ bool IsTop = Zone.isTop();
+ if ((Flavor != InstructionFlavor::Fence && IsTop) || (Flavor != InstructionFlavor::DS && !IsTop))
return 0;
- HardwareUnitInfo *FenceHWUI = getHWUIFromFlavor(Flavor);
- HardwareUnitInfo *DSHWUI = getHWUIFromFlavor(InstructionFlavor::DS);
+ HardwareUnitInfo *ConsumerHWUI = getHWUIFromFlavor(Flavor);
+ HardwareUnitInfo *ProducerHWUI = getHWUIFromFlavor(IsTop ? InstructionFlavor::DS : InstructionFlavor::Fence);
- SUnit *LastDS = DSHWUI->getLastScheduledSU();
- if (!LastDS)
+ SUnit *LastProducer = ProducerHWUI->getLastScheduledSU();
+ if (!LastProducer)
return 0;
- SUnit *LastFence = FenceHWUI->getLastScheduledSU();
- unsigned LastFenceCycle = LastFence ? LastFence->TopReadyCycle : 0;
- unsigned LastDSCycle = LastDS->TopReadyCycle;
+ SUnit *LastConsumer = ConsumerHWUI->getLastScheduledSU();
+ unsigned LastConsumerCycle = LastConsumer ? LastConsumer->TopReadyCycle : 0;
+ unsigned LastProducerCycle = LastProducer->TopReadyCycle;
- if (LastDSCycle < LastFenceCycle)
+ if (LastProducerCycle < LastConsumerCycle)
return 0;
- unsigned LastDSFinish = LastDSCycle + getHWUICyclesForSU(LastDS);
- return LastDSFinish <= CurrCycle ? 0 : LastDSFinish - CurrCycle;
+ unsigned FenceStallFinish = LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
+ return FenceStallFinish <= CurrCycle ? 0 : FenceStallFinish - CurrCycle;
};
// Treat stalls as a single scheduling cost for the current cycle.
@@ -911,7 +913,7 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
- if (SameBoundary) {
+ if (SameBoundary) {
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
>From 466f439973fe56e3ed34e14ca73dd5f809e3e1af Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:08:36 -0700
Subject: [PATCH 4/7] Add back tryLatency
Change-Id: I12d4f255c48ed77ba927eb3b192e5903f1f5e24f
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 1db3f6201d1b0..b4751027960b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -913,7 +913,13 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
- if (SameBoundary) {
+ if (SameBoundary) {
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
>From 64b74c414ba4e7675b5fe18aec622b330d217129 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:09:51 -0700
Subject: [PATCH 5/7] clang-format
Change-Id: I534b1a979f55339a814ef3416c2492252845add5
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index b4751027960b4..e95ee4d0d7092 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -438,11 +438,13 @@ bool CandidateHeuristics::tryEffectiveStall(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
bool IsTop = Zone.isTop();
- if ((Flavor != InstructionFlavor::Fence && IsTop) || (Flavor != InstructionFlavor::DS && !IsTop))
+ if ((Flavor != InstructionFlavor::Fence && IsTop) ||
+ (Flavor != InstructionFlavor::DS && !IsTop))
return 0;
HardwareUnitInfo *ConsumerHWUI = getHWUIFromFlavor(Flavor);
- HardwareUnitInfo *ProducerHWUI = getHWUIFromFlavor(IsTop ? InstructionFlavor::DS : InstructionFlavor::Fence);
+ HardwareUnitInfo *ProducerHWUI = getHWUIFromFlavor(
+ IsTop ? InstructionFlavor::DS : InstructionFlavor::Fence);
SUnit *LastProducer = ProducerHWUI->getLastScheduledSU();
if (!LastProducer)
@@ -455,7 +457,8 @@ bool CandidateHeuristics::tryEffectiveStall(
if (LastProducerCycle < LastConsumerCycle)
return 0;
- unsigned FenceStallFinish = LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
+ unsigned FenceStallFinish =
+ LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
return FenceStallFinish <= CurrCycle ? 0 : FenceStallFinish - CurrCycle;
};
>From 6744fde8019a9e47f28d5e51ca442482fc71983e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 Apr 2026 16:11:50 -0700
Subject: [PATCH 6/7] Add comment
Change-Id: I2180bba631fe4a01ed3c3fbcfa8c19cbefa84133
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index e95ee4d0d7092..eb259e2d1a09d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -457,6 +457,7 @@ bool CandidateHeuristics::tryEffectiveStall(
if (LastProducerCycle < LastConsumerCycle)
return 0;
+ // Latency comes from DS regardless of bottom-up / top-down.
unsigned FenceStallFinish =
LastProducerCycle + getHWUICyclesForSU(IsTop ? LastProducer : SU);
return FenceStallFinish <= CurrCycle ? 0 : FenceStallFinish - CurrCycle;
>From 3e26f22711c122ee2883c2c981763f7b9c22b639 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 15 Apr 2026 16:14:27 -0700
Subject: [PATCH 7/7] Address comments from
https://github.com/llvm/llvm-project/pull/188658
Change-Id: Ia94c567a753168c1ffa16dc5d91195e7dd0ba044
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 6 +-
.../AMDGPU/coexec-sched-effective-stall.mir | 222 +++++++++---------
2 files changed, 114 insertions(+), 114 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index eb259e2d1a09d..d710d40a7ea98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -680,9 +680,9 @@ void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
- assert((PreRADirection == MISched::Unspecified ||
- PreRADirection == MISched::TopDown) &&
- "coexec scheduler only supports top-down scheduling");
+ if (PreRADirection == MISched::BottomUp ||
+ PreRADirection == MISched::Bidirectional)
+ report_fatal_error("CoExecSchedStrategy only support TopDown scheduling.");
RegionPolicy.OnlyTopDown = true;
RegionPolicy.OnlyBottomUp = false;
}
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 7868c7dcf88a3..772eb91d44c59 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -136,14 +136,14 @@ body: |
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -162,7 +162,7 @@ body: |
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
; DEFAULT-NEXT: ATOMIC_FENCE 4, 2
- ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
;
; COEXEC-LABEL: name: test-fence-stall
@@ -191,44 +191,44 @@ body: |
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
- ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: ATOMIC_FENCE 4, 2
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
- %95:vgpr_32 = IMPLICIT_DEF
- %90:vreg_256_align2 = IMPLICIT_DEF
- %91:vreg_256_align2 = IMPLICIT_DEF
- %85:sreg_64_xexec = IMPLICIT_DEF
- %249:sreg_32_xm0_xexec = IMPLICIT_DEF
- %2373:vreg_512_align2 = IMPLICIT_DEF
- %2369:vreg_512_align2 = IMPLICIT_DEF
- %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vreg_256_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:sreg_64_xexec = IMPLICIT_DEF
+ %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %5:vreg_512_align2 = IMPLICIT_DEF
+ %6:vreg_512_align2 = IMPLICIT_DEF
+ %7:vreg_128_lo256_align2 = IMPLICIT_DEF
%8:vgpr_32_lo256 = IMPLICIT_DEF
- undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
- %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
- undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
- %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
- undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
- %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
- undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
- %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
- undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
- %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
- undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
- %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
- undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
- %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
- %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
- undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
- %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+ %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+ undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+ %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+ undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+ %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+ undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+ %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+ undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+ %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+ undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+ %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+ undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+ %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+ %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+ %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
ATOMIC_FENCE 4, 2
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0, amdgpu_allvgprs
...
@@ -243,14 +243,14 @@ body: |
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -269,7 +269,7 @@ body: |
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
; DEFAULT-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
- ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
;
; COEXEC-LABEL: name: test-tensorcnt-stall
@@ -298,44 +298,44 @@ body: |
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
- ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
- %95:vgpr_32 = IMPLICIT_DEF
- %90:vreg_256_align2 = IMPLICIT_DEF
- %91:vreg_256_align2 = IMPLICIT_DEF
- %85:sreg_64_xexec = IMPLICIT_DEF
- %249:sreg_32_xm0_xexec = IMPLICIT_DEF
- %2373:vreg_512_align2 = IMPLICIT_DEF
- %2369:vreg_512_align2 = IMPLICIT_DEF
- %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vreg_256_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:sreg_64_xexec = IMPLICIT_DEF
+ %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %5:vreg_512_align2 = IMPLICIT_DEF
+ %6:vreg_512_align2 = IMPLICIT_DEF
+ %7:vreg_128_lo256_align2 = IMPLICIT_DEF
%8:vgpr_32_lo256 = IMPLICIT_DEF
- undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
- %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
- undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
- %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
- undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
- %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
- undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
- %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
- undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
- %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
- undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
- %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
- undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
- %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
- %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
- undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
- %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+ %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+ undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+ %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+ undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+ %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+ undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+ %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+ undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+ %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+ undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+ %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+ undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+ %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+ %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+ %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
S_WAIT_TENSORCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0, amdgpu_allvgprs
...
@@ -349,14 +349,14 @@ body: |
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 0, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 64, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 128, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_1:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 192, 0, implicit $exec
- ; DEFAULT-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 256, 0, implicit $exec
; DEFAULT-NEXT: dead undef [[DS_LOAD_TR16_B128_2:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 320, 0, implicit $exec
; DEFAULT-NEXT: undef [[DS_LOAD_TR16_B128_3:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 [[DEF4]], 384, 0, implicit $exec
@@ -375,7 +375,7 @@ body: |
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
; DEFAULT-NEXT: dead [[DEF8:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF8]], [[DEF7]].sub1, implicit-def dead $scc
; DEFAULT-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
- ; DEFAULT-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 8, 0, [[DEF2]].sub0, [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, amdgpu_allvgprs
;
; COEXEC-LABEL: name: test-dscnt-stall
@@ -404,43 +404,43 @@ body: |
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_128_lo256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = S_ADD_I32 [[DEF4]], [[DEF3]].sub1, implicit-def dead $scc
- ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
; COEXEC-NEXT: dead early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: dead early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
+ ; COEXEC-NEXT: dead early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: dead early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 8, 0, [[DEF7]].sub0, [[DEF8]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, amdgpu_allvgprs
- %95:vgpr_32 = IMPLICIT_DEF
- %90:vreg_256_align2 = IMPLICIT_DEF
- %91:vreg_256_align2 = IMPLICIT_DEF
- %85:sreg_64_xexec = IMPLICIT_DEF
- %249:sreg_32_xm0_xexec = IMPLICIT_DEF
- %2373:vreg_512_align2 = IMPLICIT_DEF
- %2369:vreg_512_align2 = IMPLICIT_DEF
- %2353:vreg_128_lo256_align2 = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vreg_256_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:sreg_64_xexec = IMPLICIT_DEF
+ %4:sreg_32_xm0_xexec = IMPLICIT_DEF
+ %5:vreg_512_align2 = IMPLICIT_DEF
+ %6:vreg_512_align2 = IMPLICIT_DEF
+ %7:vreg_128_lo256_align2 = IMPLICIT_DEF
%8:vgpr_32_lo256 = IMPLICIT_DEF
- undef %134.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 0, 0, implicit $exec
- %134.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 64, 0, implicit $exec
- undef %143.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 128, 0, implicit $exec
- %143.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 192, 0, implicit $exec
- undef %152.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 256, 0, implicit $exec
- %152.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 320, 0, implicit $exec
- undef %161.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 384, 0, implicit $exec
- %161.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 448, 0, implicit $exec
- undef %170.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 512, 0, implicit $exec
- %170.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 576, 0, implicit $exec
- undef %179.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 640, 0, implicit $exec
- %179.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 704, 0, implicit $exec
- undef %188.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 768, 0, implicit $exec
- %188.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 832, 0, implicit $exec
- %249:sreg_32_xm0_xexec = S_ADD_I32 %249:sreg_32_xm0_xexec, %85.sub1:sreg_64_xexec, implicit-def dead $scc
- undef %197.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 896, 0, implicit $exec
- %197.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %95:vgpr_32, 960, 0, implicit $exec
+ undef %9.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 0, 0, implicit $exec
+ %9.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 64, 0, implicit $exec
+ undef %10.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 128, 0, implicit $exec
+ %10.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 192, 0, implicit $exec
+ undef %11.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 256, 0, implicit $exec
+ %11.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 320, 0, implicit $exec
+ undef %12.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 384, 0, implicit $exec
+ %12.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 448, 0, implicit $exec
+ undef %13.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 512, 0, implicit $exec
+ %13.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 576, 0, implicit $exec
+ undef %14.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 640, 0, implicit $exec
+ %14.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 704, 0, implicit $exec
+ undef %15.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 768, 0, implicit $exec
+ %15.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 832, 0, implicit $exec
+ %4:sreg_32_xm0_xexec = S_ADD_I32 %4:sreg_32_xm0_xexec, %3.sub1:sreg_64_xexec, implicit-def dead $scc
+ undef %16.sub0_sub1_sub2_sub3:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 896, 0, implicit $exec
+ %16.sub4_sub5_sub6_sub7:vreg_256_align2 = DS_LOAD_TR16_B128 %0:vgpr_32, 960, 0, implicit $exec
S_WAIT_DSCNT 5, implicit-def dead $tensorcnt, implicit $tensorcnt
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- early-clobber %2379:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2373:vreg_512_align2, %2369:vreg_512_align2, 8, 0, %2353.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %17:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %18:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %19:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ early-clobber %20:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5:vreg_512_align2, %6:vreg_512_align2, 8, 0, %7.sub0:vreg_128_lo256_align2, %8:vgpr_32_lo256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0, amdgpu_allvgprs
...
More information about the llvm-branch-commits
mailing list