[llvm-branch-commits] [llvm] [AMDGPU] Add stalls for DS FIFO buffer (PR #192323)
Jeffrey Byrnes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu May 28 15:46:55 PDT 2026
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/192323
>From ad62b62453d0ed6f27939ddd5b40fe76078160a5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 28 May 2026 09:59:42 -0700
Subject: [PATCH 1/6] [AMDGPU] Add stalls for DS FIFO buffer
Change-Id: I73e56da97a931349e0655e4e20b24aeb97920647
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 46 +++++-
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 47 +++++-
llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll | 139 +++++++++---------
3 files changed, 151 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 18cc341969309..fadc5cdf5a1ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -137,10 +137,12 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
if (TotalCycles == 0)
return;
+ ScheduledSUs.push_back(SU);
AllSUs.remove(SU);
PrioritySUs.remove(SU);
- TotalCycles -= BlockingCycles;
+ if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0))
+ TotalCycles -= BlockingCycles;
if (AllSUs.empty())
return;
@@ -167,6 +169,14 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
}
+void HardwareUnitInfo::finalizeCycles() {
+ if (BufferSize <= 1 || !AllSUs.size())
+ return;
+
+ BufferCycles = TotalCycles / AllSUs.size();
+ TotalCycles /= BufferSize;
+}
+
HardwareUnitInfo *
CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
for (HardwareUnitInfo &HWUICand : HWUInfo) {
@@ -216,6 +226,7 @@ void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
+ HWUInfo[(int)InstructionFlavor::DS].setBufferSize(DefaultBufferSizes::DS);
collectHWUIPressure();
}
@@ -229,6 +240,10 @@ void CandidateHeuristics::collectHWUIPressure() {
HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
}
+ for (auto &HWUI : HWUInfo) {
+ HWUI.finalizeCycles();
+ }
+
LLVM_DEBUG(dumpRegionSummary());
}
@@ -668,7 +683,26 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedCandidate &TryCand,
- SchedBoundary &Zone) const {
+ SchedBoundary &Zone) {
+ auto getBufferFullStalls = [this,
+ &Zone](SUnit *SU) -> unsigned {
+ InstructionFlavor Flavor = classifyFlavor(
+ *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
+ HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
+
+ if (HWUI->getBufferSize() <= 1)
+ return 0;
+
+ // getBufferAvailableCycle assumes top-down scheduling.
+ assert(Zone.isTop());
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned BufferReadyCycle = HWUI->getBufferAvailableCycle(CurrCycle);
+ if (BufferReadyCycle <= CurrCycle)
+ return 0;
+
+ return BufferReadyCycle - CurrCycle;
+ };
+
// Treat structural and latency stalls as a single scheduling cost for the
// current cycle.
struct StallCosts {
@@ -676,6 +710,7 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
unsigned Structural = 0;
unsigned Latency = 0;
unsigned Effective = 0;
+ unsigned Buffer = 0;
};
unsigned CurrCycle = Zone.getCurrCycle();
@@ -685,7 +720,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
Costs.Structural = getStructuralStallCycles(Zone, SU);
Costs.Latency = Zone.getLatencyStallCycles(SU);
- Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
+ Costs.Buffer = getBufferFullStalls(SU);
+ Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
return Costs;
};
@@ -695,10 +731,10 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
dbgs() << "Effective stalls: try=" << TryCosts.Effective
<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
- << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
+ << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer << ") cand=" << CandCosts.Effective
<< " (ready=" << CandCosts.Ready
<< ", struct=" << CandCosts.Structural
- << ", lat=" << CandCosts.Latency << ")\n";
+ << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer << ")\n";
});
return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index e8471540cbaed..9532a4ce1f8ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -20,6 +20,9 @@
namespace llvm {
namespace AMDGPU {
+namespace DefaultBufferSizes {
+constexpr unsigned DS = 16;
+} // namespace DefaultBufferSizes
//===----------------------------------------------------------------------===//
// Instruction Flavor Classification
@@ -163,6 +166,8 @@ class HardwareUnitInfo {
SmallSetVector<SUnit *, 16> PrioritySUs;
/// All the SUs in the region that consume this resource.
SmallSetVector<SUnit *, 16> AllSUs;
+ /// All the SUs for this HardwareUnit that have already been scheduled.
+ SmallVector<SUnit *, 16> ScheduledSUs;
/// The total number of busy cycles for this HardwareUnit for a given region.
unsigned TotalCycles = 0;
/// InstructionFlavor mapping.
@@ -172,6 +177,11 @@ class HardwareUnitInfo {
/// / MFMA instructions may take multiple cycles, which may be overlapped with
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
+ /// How many instructons can be held simultaneously for this HardwareUnit.
+ /// A value of 0 or 1 means that there is no buffer.
+ unsigned BufferSize = 0;
+ /// How many cycles it takes for an instruction to clear the buffer.
+ unsigned BufferCycles = 0;
public:
HardwareUnitInfo() {}
@@ -193,6 +203,24 @@ class HardwareUnitInfo {
bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
+ void setBufferSize(unsigned Size) { BufferSize = Size; }
+
+ unsigned getBufferSize() { return BufferSize; }
+
+ /// \returns the next cycle where there is space in the buffer.
+ unsigned getBufferAvailableCycle(unsigned CurrCycle) {
+ // There is no buffer.
+ if (BufferSize <= 1)
+ return CurrCycle;
+
+ // Buffer is available now.
+ if (ScheduledSUs.size() < BufferSize)
+ return CurrCycle;
+
+ return BufferCycles +
+ ScheduledSUs[ScheduledSUs.size() - BufferSize]->TopReadyCycle;
+ }
+
/// \returns the SUnit with higher priority or nullptr if they are the same.
/// This method looks through the PrioritySUs to determine if one SU is more
/// prioritized than the other. If neither are in the PrioritySUs list, then
@@ -214,6 +242,8 @@ class HardwareUnitInfo {
TotalCycles = 0;
Type = AMDGPU::InstructionFlavor::Other;
ProducesCoexecWindow = false;
+ BufferSize = 0;
+ BufferCycles = 0;
}
/// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
@@ -233,6 +263,11 @@ class HardwareUnitInfo {
/// and reducing its \p BlockingCycles from the TotalCycles. This maintains
/// the list of PrioritySUs.
void markScheduled(SUnit *SU, unsigned BlockingCycles);
+ /// After we've collected all the region pressure for this HWUI, correct for
+ /// any specifics of the behavior of this resource. For example, if we the
+ /// HardwareUnit can hold N instructions simultaneously, then there is no
+ /// penalty for scheduling N instructions back to back.
+ void finalizeCycles();
};
//===----------------------------------------------------------------------===//
@@ -257,10 +292,6 @@ class CandidateHeuristics {
/// SU.
unsigned getHWUICyclesForInst(SUnit *SU);
- /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
- /// mapped HardwareUnit.
- HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
-
public:
CandidateHeuristics() = default;
@@ -270,7 +301,11 @@ class CandidateHeuristics {
/// Update the state to reflect that \p SU is going to be scheduled.
void updateForScheduling(SUnit *SU);
- /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
+ /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
+ /// mapped HardwareUnit.
+ HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
+
+ /// Sort the HardwareUnitInfo vector. After sorting, the HWUI that are highest
/// priority are first. Priority is determined by maximizing coexecution and
/// keeping the critical HardwareUnit busy.
void sortHWUIResources();
@@ -299,7 +334,7 @@ class CandidateHeuristics {
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary &Zone) const;
+ SchedBoundary &Zone);
AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
CandidateHeuristics Heurs;
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
index ac121195de432..b2fa3adeb6736 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -41,39 +41,38 @@ define amdgpu_kernel void @ds_wmma(ptr addrspace(3) %base, ptr addrspace(1) %out
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_mov_b32_e32 v88, s2
+; COEXEC-NEXT: v_mov_b32_e32 v80, s2
; COEXEC-NEXT: s_add_co_i32 s2, s2, s1
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v88 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v88
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v88 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v88 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v88 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v88 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v88 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v88 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v88 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v88 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v88 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v88 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v88 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v88 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v88 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v88 offset:768
-; COEXEC-NEXT: s_wait_dscnt 0xc
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v80 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v80
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v80 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v80 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v80 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v80 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v80 offset:256
+; COEXEC-NEXT: s_wait_dscnt 0x3
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v80 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v80 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v80 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v80 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v80 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v80 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v80 offset:768
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v80 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v80 offset:896
; COEXEC-NEXT: s_wait_dscnt 0x8
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x4
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB0_1
; COEXEC-NEXT: ; %bb.2: ; %end
-; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
; COEXEC-NEXT: s_wait_kmcnt 0x0
@@ -285,65 +284,65 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_mov_b32_e32 v124, s7
+; COEXEC-NEXT: v_mov_b32_e32 v76, s7
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:64
; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; COEXEC-NEXT: v_mov_b32_e32 v156, s8
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v124
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v124 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v156
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v156 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v124 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v124 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v156 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v156 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v124 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v124 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v156 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v156 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v124 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v124 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v156 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v156 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v124 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v124 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v124 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v124 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[112:115], v124 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[116:119], v124 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[120:123], v124 offset:896
-; COEXEC-NEXT: s_wait_dscnt 0x13
+; COEXEC-NEXT: v_mov_b32_e32 v108, s8
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v76 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v76 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v108 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v108 offset:256
+; COEXEC-NEXT: s_wait_dscnt 0x4
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: ds_load_tr16_b128 v[124:127], v124 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[128:131], v156 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[132:135], v156 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[136:139], v156 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[140:143], v156 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[144:147], v156 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[148:151], v156 offset:704
-; COEXEC-NEXT: s_wait_dscnt 0x16
+; COEXEC-NEXT: s_wait_dscnt 0x0
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
-; COEXEC-NEXT: ds_load_tr16_b128 v[152:155], v156 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[156:159], v156 offset:960
-; COEXEC-NEXT: s_wait_dscnt 0x14
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: s_wait_dscnt 0x10
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:576
+; COEXEC-NEXT: s_wait_dscnt 0x0
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[32:39], v[40:47], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[32:39], v[40:47], v[8:15]
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v76 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v76 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v76 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v76 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v76 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v76 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v76 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v76 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v108 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v108 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v108 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v108 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v108 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v108 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v108 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v108 offset:960
+; COEXEC-NEXT: s_wait_dscnt 0x10
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[32:39], v[40:47], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[32:39], v[40:47], v[0:7]
; COEXEC-NEXT: s_wait_dscnt 0x6
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[48:55], v[80:87], v[24:31]
; COEXEC-NEXT: s_wait_dscnt 0x4
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[88:95], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x2
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[96:103], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[72:79], v[104:111], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[48:55], v[80:87], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[88:95], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[96:103], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[72:79], v[104:111], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB1_1
; COEXEC-NEXT: ; %bb.2: ; %end
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
>From 03dd7dd371c3c0cdd7e55d8433ae22c31feac30a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 13 Mar 2026 14:29:33 -0700
Subject: [PATCH 2/6] Typo
Change-Id: I8b8da8a07be84506483f474d0a5e10ad79178c15
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 9532a4ce1f8ed..0ec78f6caaa1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -177,7 +177,7 @@ class HardwareUnitInfo {
/// / MFMA instructions may take multiple cycles, which may be overlapped with
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
- /// How many instructons can be held simultaneously for this HardwareUnit.
+ /// How many instructions can be held simultaneously for this HardwareUnit.
/// A value of 0 or 1 means that there is no buffer.
unsigned BufferSize = 0;
/// How many cycles it takes for an instruction to clear the buffer.
>From e6d4d894a7c28edfa47171fbcd76090d689f8a8e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Mar 2026 15:37:51 -0700
Subject: [PATCH 3/6] Merge conflicts
Change-Id: I33564a1e5d14f3b53577cb463ba2cb3a7993fd24
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index fadc5cdf5a1ea..fefcf1ebf6cd3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -731,10 +731,11 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
dbgs() << "Effective stalls: try=" << TryCosts.Effective
<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
- << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer << ") cand=" << CandCosts.Effective
- << " (ready=" << CandCosts.Ready
+ << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer
+ << ") cand=" << CandCosts.Effective << " (ready=" << CandCosts.Ready
<< ", struct=" << CandCosts.Structural
- << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer << ")\n";
+ << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer
+ << ")\n";
});
return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
>From e245beee42269dd54163f1dc280b047aeb4fd267 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Mar 2026 15:59:16 -0700
Subject: [PATCH 4/6] Claude Code review
Change-Id: Id4983ca59270c8bb2d261d38a6e7f2483c9d237e
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 22 +++++++++++++++----
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 1 +
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index fefcf1ebf6cd3..22efa732e6d6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -170,10 +170,24 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
void HardwareUnitInfo::finalizeCycles() {
- if (BufferSize <= 1 || !AllSUs.size())
+ if (BufferSize <= 1 || AllSUs.empty())
return;
+ // We estimate the amount of cycles it takes to free up a slot in the buffer
+ // as the average cycles per SU.
BufferCycles = TotalCycles / AllSUs.size();
+ // The TotalCycles is normalized against the BufferSize.
+ // This provides an estimate of the TotalCycles which is not always accurate
+ // -- particularly in cases where we have fewer instructions than the
+ // BufferSize. For example, if we have 2 instructions which each take 50
+ // cycles and a BufferSize of 16, then a TotalCycles of 51 cycles would be
+ // somewhat accurate. This normalization calculates TotalCycles as 6. However,
+ // if we have 64 of these instructions, our normalized estimate of 200 is more
+ // reasonable, given the more accurate measure is 264. Having a completely
+ // accurate measure is not very important, since this metric is mainly used to
+ // compare the relative demand per HardwareUnit across the region. The simpler
+ // estimate makes managing the metric incrementally during scheduling much
+ // simpler.
TotalCycles /= BufferSize;
}
@@ -684,8 +698,7 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone) {
- auto getBufferFullStalls = [this,
- &Zone](SUnit *SU) -> unsigned {
+ auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
InstructionFlavor Flavor = classifyFlavor(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
@@ -721,7 +734,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
Costs.Structural = getStructuralStallCycles(Zone, SU);
Costs.Latency = Zone.getLatencyStallCycles(SU);
Costs.Buffer = getBufferFullStalls(SU);
- Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
+ Costs.Effective =
+ std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
return Costs;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 0ec78f6caaa1c..198e9b007fa11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -239,6 +239,7 @@ class HardwareUnitInfo {
void reset() {
AllSUs.clear();
PrioritySUs.clear();
+ ScheduledSUs.clear();
TotalCycles = 0;
Type = AMDGPU::InstructionFlavor::Other;
ProducesCoexecWindow = false;
>From 697208ec098cfcd0ff51af6b63b0243b20acf41b Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 24 Apr 2026 09:29:10 -0700
Subject: [PATCH 5/6] Address Review comments
Change-Id: I6972e887edd5db44ee9bcaed1f79e0c9933f611e
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 7 ++++++-
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 20 ++++++++++++++++---
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 22efa732e6d6f..9e73a6a526047 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -141,6 +141,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
AllSUs.remove(SU);
PrioritySUs.remove(SU);
+ // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+ // BlockingCycles
if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0))
TotalCycles -= BlockingCycles;
@@ -170,6 +172,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
void HardwareUnitInfo::finalizeCycles() {
+ // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+ // BlockingCycles
if (BufferSize <= 1 || AllSUs.empty())
return;
@@ -703,7 +707,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
- if (HWUI->getBufferSize() <= 1)
+ // A BufferSize of 0 means "unlimited" buffer, thus we will never fill it.
+ if (HWUI->getBufferSize() == 0)
return 0;
// getBufferAvailableCycle assumes top-down scheduling.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 198e9b007fa11..fd637e9f8efce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -178,9 +178,23 @@ class HardwareUnitInfo {
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
/// How many instructions can be held simultaneously for this HardwareUnit.
- /// A value of 0 or 1 means that there is no buffer.
+ /// A value of 0 means there is no limit.
+ ///
+ /// This may approximate the hardware. For example, for LDS instructions
+ /// it is a well-known phenomenon that oversubscribing the LDS unit results in
+ /// longer latency for the LDS instructions. While it is true that there is a
+ /// hard limit to the amount of simultaneous in-flight LDS instructions, good
+ /// scheduling would also cool off the LDS to avoid other forms of hardware
+ /// contention and increasing LDS latency. Thus, we limit the amount of LDS
+ /// instructions we are willing to schedule close together, though this does
+ /// not correspond 1:1 with a hardware mechanism.
unsigned BufferSize = 0;
/// How many cycles it takes for an instruction to clear the buffer.
+ ///
+ /// Again, this may be an approximation. For example, for memory FIFOs, the
+ /// actual amount of cycles it will take to clear it is dependent on how
+ /// quickly prior instructions evacuate the FIFO, which is based on runtime
+ /// behavior which is not modelled in the compiler.
unsigned BufferCycles = 0;
public:
@@ -210,7 +224,7 @@ class HardwareUnitInfo {
/// \returns the next cycle where there is space in the buffer.
unsigned getBufferAvailableCycle(unsigned CurrCycle) {
// There is no buffer.
- if (BufferSize <= 1)
+ if (BufferSize == 0)
return CurrCycle;
// Buffer is available now.
@@ -265,7 +279,7 @@ class HardwareUnitInfo {
/// the list of PrioritySUs.
void markScheduled(SUnit *SU, unsigned BlockingCycles);
/// After we've collected all the region pressure for this HWUI, correct for
- /// any specifics of the behavior of this resource. For example, if we the
+ /// any specifics of the behavior of this resource. For example, if the
/// HardwareUnit can hold N instructions simultaneously, then there is no
/// penalty for scheduling N instructions back to back.
void finalizeCycles();
>From 0a24db3557f3ff5a8cab577d4d77f7a40b28bd4a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 28 May 2026 15:24:10 -0700
Subject: [PATCH 6/6] Update test
Change-Id: I5dde891c0eb765ca91a019161101b5ea391bd2e5
---
llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll | 139 ++++++++++---------
1 file changed, 70 insertions(+), 69 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
index b2fa3adeb6736..ac121195de432 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -41,38 +41,39 @@ define amdgpu_kernel void @ds_wmma(ptr addrspace(3) %base, ptr addrspace(1) %out
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_mov_b32_e32 v80, s2
+; COEXEC-NEXT: v_mov_b32_e32 v88, s2
; COEXEC-NEXT: s_add_co_i32 s2, s2, s1
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v80 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v80
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v80 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v80 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v80 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v80 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v80 offset:256
-; COEXEC-NEXT: s_wait_dscnt 0x3
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v88 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v88
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v88 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v88 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v88 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v88 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v88 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v88 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v88 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v88 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v88 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v88 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v88 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v88 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v88 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v88 offset:768
+; COEXEC-NEXT: s_wait_dscnt 0xc
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v80 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v80 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v80 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v80 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v80 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v80 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v80 offset:768
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v80 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v80 offset:896
; COEXEC-NEXT: s_wait_dscnt 0x8
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x4
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB0_1
; COEXEC-NEXT: ; %bb.2: ; %end
+; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
; COEXEC-NEXT: s_wait_kmcnt 0x0
@@ -284,65 +285,65 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_mov_b32_e32 v76, s7
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:64
+; COEXEC-NEXT: v_mov_b32_e32 v124, s7
; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; COEXEC-NEXT: v_mov_b32_e32 v108, s8
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v76 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v76 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v108 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v108 offset:256
-; COEXEC-NEXT: s_wait_dscnt 0x4
+; COEXEC-NEXT: v_mov_b32_e32 v156, s8
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v124
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v124 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v156
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v156 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v124 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v124 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v156 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v156 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v124 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v124 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v156 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v156 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v124 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v124 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v156 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v156 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v124 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v124 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v124 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v124 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[112:115], v124 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[116:119], v124 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[120:123], v124 offset:896
+; COEXEC-NEXT: s_wait_dscnt 0x13
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: s_wait_dscnt 0x0
+; COEXEC-NEXT: ds_load_tr16_b128 v[124:127], v124 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[128:131], v156 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[132:135], v156 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[136:139], v156 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[140:143], v156 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[144:147], v156 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[148:151], v156 offset:704
+; COEXEC-NEXT: s_wait_dscnt 0x16
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT: ds_load_tr16_b128 v[152:155], v156 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[156:159], v156 offset:960
+; COEXEC-NEXT: s_wait_dscnt 0x14
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT: s_wait_dscnt 0x10
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:576
-; COEXEC-NEXT: s_wait_dscnt 0x0
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[32:39], v[40:47], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[32:39], v[40:47], v[8:15]
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v76 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v76 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v76 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v76 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v76 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v76 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v76 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v76 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v76 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v76 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v108 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v108 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v108 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v108 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v108 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v108 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v108 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v108 offset:960
-; COEXEC-NEXT: s_wait_dscnt 0x10
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[32:39], v[40:47], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[32:39], v[40:47], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
; COEXEC-NEXT: s_wait_dscnt 0x6
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[48:55], v[80:87], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
; COEXEC-NEXT: s_wait_dscnt 0x4
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[88:95], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x2
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[96:103], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[72:79], v[104:111], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[48:55], v[80:87], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[88:95], v[16:23]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[96:103], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[72:79], v[104:111], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB1_1
; COEXEC-NEXT: ; %bb.2: ; %end
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
More information about the llvm-branch-commits
mailing list