[llvm-branch-commits] [llvm] [AMDGPU] Add stalls for DS FIFO buffer (PR #192323)
Jeffrey Byrnes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Apr 24 09:56:12 PDT 2026
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/192323
>From de914f9a03b7d9bd1b9bc30a904e7206330870df Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 6 Mar 2026 17:32:19 -0800
Subject: [PATCH 1/5] [AMDGPU] Add stalls for DS FIFO buffer
Change-Id: I73e56da97a931349e0655e4e20b24aeb97920647
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 46 +++++++-
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 47 +++++++-
llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll | 105 ++++++++++--------
3 files changed, 138 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index b4b6f18b0b45b..01f7a874d3073 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -137,10 +137,12 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
if (TotalCycles == 0)
return;
+ ScheduledSUs.push_back(SU);
AllSUs.remove(SU);
PrioritySUs.remove(SU);
- TotalCycles -= BlockingCycles;
+ if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0))
+ TotalCycles -= BlockingCycles;
if (AllSUs.empty())
return;
@@ -167,6 +169,14 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
}
+void HardwareUnitInfo::finalizeCycles() {
+ if (BufferSize <= 1 || !AllSUs.size())
+ return;
+
+ BufferCycles = TotalCycles / AllSUs.size();
+ TotalCycles /= BufferSize;
+}
+
HardwareUnitInfo *
CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
for (HardwareUnitInfo &HWUICand : HWUInfo) {
@@ -216,6 +226,7 @@ void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
+ HWUInfo[(int)InstructionFlavor::DS].setBufferSize(DefaultBufferSizes::DS);
collectHWUIPressure();
}
@@ -229,6 +240,10 @@ void CandidateHeuristics::collectHWUIPressure() {
HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
}
+ for (auto &HWUI : HWUInfo) {
+ HWUI.finalizeCycles();
+ }
+
LLVM_DEBUG(dumpRegionSummary());
}
@@ -666,7 +681,26 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedCandidate &TryCand,
- SchedBoundary &Zone) const {
+ SchedBoundary &Zone) {
+ auto getBufferFullStalls = [this,
+ &Zone](SUnit *SU) -> unsigned {
+ InstructionFlavor Flavor = classifyFlavor(
+ *SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
+ HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
+
+ if (HWUI->getBufferSize() <= 1)
+ return 0;
+
+ // getBufferAvailableCycle assumes top-down scheduling.
+ assert(Zone.isTop());
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned BufferReadyCycle = HWUI->getBufferAvailableCycle(CurrCycle);
+ if (BufferReadyCycle <= CurrCycle)
+ return 0;
+
+ return BufferReadyCycle - CurrCycle;
+ };
+
// Treat structural and latency stalls as a single scheduling cost for the
// current cycle.
struct StallCosts {
@@ -674,6 +708,7 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
unsigned Structural = 0;
unsigned Latency = 0;
unsigned Effective = 0;
+ unsigned Buffer = 0;
};
unsigned CurrCycle = Zone.getCurrCycle();
@@ -683,7 +718,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
Costs.Structural = getStructuralStallCycles(Zone, SU);
Costs.Latency = Zone.getLatencyStallCycles(SU);
- Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
+ Costs.Buffer = getBufferFullStalls(SU);
+ Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
return Costs;
};
@@ -693,10 +729,10 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
dbgs() << "Effective stalls: try=" << TryCosts.Effective
<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
- << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
+ << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer << ") cand=" << CandCosts.Effective
<< " (ready=" << CandCosts.Ready
<< ", struct=" << CandCosts.Structural
- << ", lat=" << CandCosts.Latency << ")\n";
+ << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer < ")\n";
});
return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index e8471540cbaed..9532a4ce1f8ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -20,6 +20,9 @@
namespace llvm {
namespace AMDGPU {
+namespace DefaultBufferSizes {
+constexpr unsigned DS = 16;
+} // namespace DefaultBufferSizes
//===----------------------------------------------------------------------===//
// Instruction Flavor Classification
@@ -163,6 +166,8 @@ class HardwareUnitInfo {
SmallSetVector<SUnit *, 16> PrioritySUs;
/// All the SUs in the region that consume this resource.
SmallSetVector<SUnit *, 16> AllSUs;
+ /// All the SUs for this HardwareUnit that have already been scheduled.
+ SmallVector<SUnit *, 16> ScheduledSUs;
/// The total number of busy cycles for this HardwareUnit for a given region.
unsigned TotalCycles = 0;
/// InstructionFlavor mapping.
@@ -172,6 +177,11 @@ class HardwareUnitInfo {
/// / MFMA instructions may take multiple cycles, which may be overlapped with
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
+ /// How many instructons can be held simultaneously for this HardwareUnit.
+ /// A value of 0 or 1 means that there is no buffer.
+ unsigned BufferSize = 0;
+ /// How many cycles it takes for an instruction to clear the buffer.
+ unsigned BufferCycles = 0;
public:
HardwareUnitInfo() {}
@@ -193,6 +203,24 @@ class HardwareUnitInfo {
bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
+ void setBufferSize(unsigned Size) { BufferSize = Size; }
+
+ unsigned getBufferSize() { return BufferSize; }
+
+ /// \returns the next cycle where there is space in the buffer.
+ unsigned getBufferAvailableCycle(unsigned CurrCycle) {
+ // There is no buffer.
+ if (BufferSize <= 1)
+ return CurrCycle;
+
+ // Buffer is available now.
+ if (ScheduledSUs.size() < BufferSize)
+ return CurrCycle;
+
+ return BufferCycles +
+ ScheduledSUs[ScheduledSUs.size() - BufferSize]->TopReadyCycle;
+ }
+
/// \returns the SUnit with higher priority or nullptr if they are the same.
/// This method looks through the PrioritySUs to determine if one SU is more
/// prioritized than the other. If neither are in the PrioritySUs list, then
@@ -214,6 +242,8 @@ class HardwareUnitInfo {
TotalCycles = 0;
Type = AMDGPU::InstructionFlavor::Other;
ProducesCoexecWindow = false;
+ BufferSize = 0;
+ BufferCycles = 0;
}
/// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
@@ -233,6 +263,11 @@ class HardwareUnitInfo {
/// and reducing its \p BlockingCycles from the TotalCycles. This maintains
/// the list of PrioritySUs.
void markScheduled(SUnit *SU, unsigned BlockingCycles);
+ /// After we've collected all the region pressure for this HWUI, correct for
+ /// any specifics of the behavior of this resource. For example, if we the
+ /// HardwareUnit can hold N instructions simultaneously, then there is no
+ /// penalty for scheduling N instructions back to back.
+ void finalizeCycles();
};
//===----------------------------------------------------------------------===//
@@ -257,10 +292,6 @@ class CandidateHeuristics {
/// SU.
unsigned getHWUICyclesForInst(SUnit *SU);
- /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
- /// mapped HardwareUnit.
- HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
-
public:
CandidateHeuristics() = default;
@@ -270,7 +301,11 @@ class CandidateHeuristics {
/// Update the state to reflect that \p SU is going to be scheduled.
void updateForScheduling(SUnit *SU);
- /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
+ /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
+ /// mapped HardwareUnit.
+ HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
+
+ /// Sort the HardwareUnitInfo vector. After sorting, the HWUI that are highest
/// priority are first. Priority is determined by maximizing coexecution and
/// keeping the critical HardwareUnit busy.
void sortHWUIResources();
@@ -299,7 +334,7 @@ class CandidateHeuristics {
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary &Zone) const;
+ SchedBoundary &Zone);
AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
CandidateHeuristics Heurs;
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
index c1e7bc005998c..831e342df1250 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -287,6 +287,11 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: ; =>This Inner Loop Header: Depth=1
; COEXEC-NEXT: s_add_co_i32 s7, s2, s6
; COEXEC-NEXT: s_add_co_i32 s8, s3, s6
+; COEXEC-NEXT: v_nop
+; COEXEC-NEXT: v_nop
+; COEXEC-NEXT: v_nop
+; COEXEC-NEXT: v_nop
+; COEXEC-NEXT: v_dual_mov_b32 v92, s7 :: v_dual_mov_b32 v108, s8
; COEXEC-NEXT: s_add_co_i32 s6, s6, s1
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
@@ -294,63 +299,65 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_mov_b32_e32 v124, s7
; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; COEXEC-NEXT: v_mov_b32_e32 v156, s8
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v124
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v124 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v156
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v156 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v124 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v156 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v124 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v156 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v124 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v156 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v124 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v156 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v124 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v156 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v124 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v156 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v124 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v124 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[112:115], v124 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[120:123], v124 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[128:131], v156 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[136:139], v156 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[144:147], v156 offset:640
-; COEXEC-NEXT: s_wait_dscnt 0x13
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v92
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v92 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v108 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v108 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v108 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v108 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v92 offset:768
+; COEXEC-NEXT: s_wait_dscnt 0x9
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: ds_load_tr16_b128 v[152:155], v156 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v124 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v124 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[116:119], v124 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[124:127], v124 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[132:135], v156 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[140:143], v156 offset:448
-; COEXEC-NEXT: s_wait_dscnt 0x16
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
-; COEXEC-NEXT: ds_load_tr16_b128 v[148:151], v156 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[156:159], v156 offset:960
-; COEXEC-NEXT: s_wait_dscnt 0x14
+; COEXEC-NEXT: s_wait_dscnt 0x5
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT: s_wait_dscnt 0x1
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: s_wait_dscnt 0x10
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_3)
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v92 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v108 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v108 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v92 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v92 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v108 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v108 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v108 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v108 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v92 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v92 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v108 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v108 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v108 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v108 offset:960
+; COEXEC-NEXT: s_wait_dscnt 0x10
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[32:39], v[0:7]
+; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[32:39], v[0:7]
; COEXEC-NEXT: s_wait_dscnt 0x3
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[72:79], v[24:31]
; COEXEC-NEXT: s_wait_dscnt 0x2
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[88:95], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x1
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[56:63], v[96:103], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[64:71], v[104:111], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[72:79], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[88:95], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[56:63], v[96:103], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[64:71], v[104:111], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB1_1
; COEXEC-NEXT: ; %bb.2: ; %end
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
>From 4a4e98f703beac31df8c69889b8f3d21ec18476c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 13 Mar 2026 14:29:33 -0700
Subject: [PATCH 2/5] Typo
Change-Id: I8b8da8a07be84506483f474d0a5e10ad79178c15
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 9532a4ce1f8ed..0ec78f6caaa1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -177,7 +177,7 @@ class HardwareUnitInfo {
/// / MFMA instructions may take multiple cycles, which may be overlapped with
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
- /// How many instructons can be held simultaneously for this HardwareUnit.
+ /// How many instructions can be held simultaneously for this HardwareUnit.
/// A value of 0 or 1 means that there is no buffer.
unsigned BufferSize = 0;
/// How many cycles it takes for an instruction to clear the buffer.
>From a21d49278b7456f7e3b4bd1a51d061ff85864e36 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Mar 2026 15:37:51 -0700
Subject: [PATCH 3/5] Merge conflicts
Change-Id: I33564a1e5d14f3b53577cb463ba2cb3a7993fd24
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 7 +-
llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll | 105 ++++++++----------
2 files changed, 53 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 01f7a874d3073..ec0727e1d1bd7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -729,10 +729,11 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
dbgs() << "Effective stalls: try=" << TryCosts.Effective
<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
- << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer << ") cand=" << CandCosts.Effective
- << " (ready=" << CandCosts.Ready
+ << ", lat=" << TryCosts.Latency << ", buffer=" << TryCosts.Buffer
+ << ") cand=" << CandCosts.Effective << " (ready=" << CandCosts.Ready
<< ", struct=" << CandCosts.Structural
- << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer < ")\n";
+ << ", lat=" << CandCosts.Latency << ", buffer=" << CandCosts.Buffer
+ << ")\n";
});
return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
index 831e342df1250..c1e7bc005998c 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -287,11 +287,6 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: ; =>This Inner Loop Header: Depth=1
; COEXEC-NEXT: s_add_co_i32 s7, s2, s6
; COEXEC-NEXT: s_add_co_i32 s8, s3, s6
-; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_nop
-; COEXEC-NEXT: v_dual_mov_b32 v92, s7 :: v_dual_mov_b32 v108, s8
; COEXEC-NEXT: s_add_co_i32 s6, s6, s1
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_nop
@@ -299,65 +294,63 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
; COEXEC-NEXT: v_nop
; COEXEC-NEXT: v_mov_b32_e32 v124, s7
; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v92
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v108
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v92 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v108 offset:64
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v108 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:256
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v108 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:320
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v108 offset:512
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v108 offset:576
-; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v92 offset:768
-; COEXEC-NEXT: s_wait_dscnt 0x9
+; COEXEC-NEXT: v_mov_b32_e32 v156, s8
+; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v124
+; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v124 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v156
+; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v156 offset:64
+; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v124 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v156 offset:256
+; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v124 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v156 offset:320
+; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v124 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v156 offset:512
+; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v124 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v156 offset:576
+; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v124 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v156 offset:768
+; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v124 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v156 offset:832
+; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v124 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v124 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[112:115], v124 offset:640
+; COEXEC-NEXT: ds_load_tr16_b128 v[120:123], v124 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[128:131], v156 offset:128
+; COEXEC-NEXT: ds_load_tr16_b128 v[136:139], v156 offset:384
+; COEXEC-NEXT: ds_load_tr16_b128 v[144:147], v156 offset:640
+; COEXEC-NEXT: s_wait_dscnt 0x13
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: s_wait_dscnt 0x5
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
-; COEXEC-NEXT: s_wait_dscnt 0x1
+; COEXEC-NEXT: ds_load_tr16_b128 v[152:155], v156 offset:896
+; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v124 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v124 offset:448
+; COEXEC-NEXT: ds_load_tr16_b128 v[116:119], v124 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[124:127], v124 offset:960
+; COEXEC-NEXT: ds_load_tr16_b128 v[132:135], v156 offset:192
+; COEXEC-NEXT: ds_load_tr16_b128 v[140:143], v156 offset:448
+; COEXEC-NEXT: s_wait_dscnt 0x16
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT: ds_load_tr16_b128 v[148:151], v156 offset:704
+; COEXEC-NEXT: ds_load_tr16_b128 v[156:159], v156 offset:960
+; COEXEC-NEXT: s_wait_dscnt 0x14
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; COEXEC-NEXT: s_wait_dscnt 0x10
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
-; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
-; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v92 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v108 offset:768
-; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v108 offset:832
-; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v92 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v92 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v108 offset:128
-; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v108 offset:384
-; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v108 offset:640
-; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v108 offset:896
-; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v92 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v92 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:960
-; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v108 offset:192
-; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v108 offset:448
-; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v108 offset:704
-; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v108 offset:960
-; COEXEC-NEXT: s_wait_dscnt 0x10
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[32:39], v[0:7]
-; COEXEC-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[32:39], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
; COEXEC-NEXT: s_wait_dscnt 0x3
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[72:79], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
; COEXEC-NEXT: s_wait_dscnt 0x2
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[88:95], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
; COEXEC-NEXT: s_wait_dscnt 0x1
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[56:63], v[96:103], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
; COEXEC-NEXT: s_wait_dscnt 0x0
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[64:71], v[104:111], v[0:7]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[72:79], v[24:31]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[88:95], v[16:23]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[56:63], v[96:103], v[8:15]
-; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[64:71], v[104:111], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
; COEXEC-NEXT: s_cbranch_vccnz .LBB1_1
; COEXEC-NEXT: ; %bb.2: ; %end
; COEXEC-NEXT: v_mov_b32_e32 v32, 0
>From a18260b1a0d1f7ccb05e64a2a42f550270c0ca0e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 23 Mar 2026 15:59:16 -0700
Subject: [PATCH 4/5] Claude Code review
Change-Id: Id4983ca59270c8bb2d261d38a6e7f2483c9d237e
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 22 +++++++++++++++----
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 1 +
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index ec0727e1d1bd7..093afb08629ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -170,10 +170,24 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
void HardwareUnitInfo::finalizeCycles() {
- if (BufferSize <= 1 || !AllSUs.size())
+ if (BufferSize <= 1 || AllSUs.empty())
return;
+ // We estimate the amount of cycles it takes to free up a slot in the buffer
+ // as the average cycles per SU.
BufferCycles = TotalCycles / AllSUs.size();
+ // The TotalCycles is normalized against the BufferSize.
+ // This provides an estimate of the TotalCycles which is not always accurate
+ // -- particularly in cases where we have fewer instructions than the
+ // BufferSize. For example, if we have 2 instructions which each take 50
+ // cycles and a BufferSize of 16, then a TotalCycles of 51 cycles would be
+ // somewhat accurate. This normalization calculates TotalCycles as 6. However,
+ // if we have 64 of these instructions, our normalized estimate of 200 is more
+ // reasonable, given the more accurate measure is 264. Having a completely
+ // accurate measure is not very important, since this metric is mainly used to
+ // compare the relative demand per HardwareUnit across the region. The simpler
+ // estimate makes managing the metric incrementally during scheduling much
+ // simpler.
TotalCycles /= BufferSize;
}
@@ -682,8 +696,7 @@ bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone) {
- auto getBufferFullStalls = [this,
- &Zone](SUnit *SU) -> unsigned {
+ auto getBufferFullStalls = [this, &Zone](SUnit *SU) -> unsigned {
InstructionFlavor Flavor = classifyFlavor(
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
@@ -719,7 +732,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
Costs.Structural = getStructuralStallCycles(Zone, SU);
Costs.Latency = Zone.getLatencyStallCycles(SU);
Costs.Buffer = getBufferFullStalls(SU);
- Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
+ Costs.Effective =
+ std::max({Costs.Ready, Costs.Structural, Costs.Latency, Costs.Buffer});
return Costs;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 0ec78f6caaa1c..198e9b007fa11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -239,6 +239,7 @@ class HardwareUnitInfo {
void reset() {
AllSUs.clear();
PrioritySUs.clear();
+ ScheduledSUs.clear();
TotalCycles = 0;
Type = AMDGPU::InstructionFlavor::Other;
ProducesCoexecWindow = false;
>From 9e4ef195dc3017fa3e4a86092e60b018900c5b53 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 24 Apr 2026 09:29:10 -0700
Subject: [PATCH 5/5] Address Review comments
Change-Id: I6972e887edd5db44ee9bcaed1f79e0c9933f611e
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 7 ++++++-
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 20 ++++++++++++++++---
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 093afb08629ec..8d550f5078b54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -141,6 +141,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
AllSUs.remove(SU);
PrioritySUs.remove(SU);
+ // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+ // BlockingCycles
if (BufferSize <= 1 || (ScheduledSUs.size() % BufferSize == 0))
TotalCycles -= BlockingCycles;
@@ -170,6 +172,8 @@ void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
}
void HardwareUnitInfo::finalizeCycles() {
+ // BufferSize of 0 or 1 implies that each SU uses the HardwareUnit for
+ // BlockingCycles
if (BufferSize <= 1 || AllSUs.empty())
return;
@@ -701,7 +705,8 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
*SU->getInstr(), *static_cast<const SIInstrInfo *>(DAG->TII));
HardwareUnitInfo *HWUI = Heurs.getHWUIFromFlavor(Flavor);
- if (HWUI->getBufferSize() <= 1)
+ // A BufferSize of 0 means "unlimited" buffer, thus we will never fill it.
+ if (HWUI->getBufferSize() == 0)
return 0;
// getBufferAvailableCycle assumes top-down scheduling.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 198e9b007fa11..fd637e9f8efce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -178,9 +178,23 @@ class HardwareUnitInfo {
/// instructions on other HardwareUnits.
bool ProducesCoexecWindow = false;
/// How many instructions can be held simultaneously for this HardwareUnit.
- /// A value of 0 or 1 means that there is no buffer.
+ /// A value of 0 means there is no limit.
+ ///
+ /// This may approximate the hardware. For example, for LDS instructions
+ /// it is a well-known phenomenon that oversubscribing the LDS unit results in
+ /// longer latency for the LDS instructions. While it is true that there is a
+ /// hard limit to the amount of simultaneous in-flight LDS instructions, good
+ /// scheduling would also cool off the LDS to avoid other forms of hardware
+ /// contention and increasing LDS latency. Thus, we limit the amount of LDS
+ /// instructions we are willing to schedule close together, though this does
+ /// not correspond 1:1 with a hardware mechanism.
unsigned BufferSize = 0;
/// How many cycles it takes for an instruction to clear the buffer.
+ ///
+ /// Again, this may be an approximation. For example, for memory FIFOs, the
+ /// actual amount of cycles it will take to clear it is dependent on how
+ /// quickly prior instructions evacuate the FIFO, which is based on runtime
+ /// behavior which is not modelled in the compiler.
unsigned BufferCycles = 0;
public:
@@ -210,7 +224,7 @@ class HardwareUnitInfo {
/// \returns the next cycle where there is space in the buffer.
unsigned getBufferAvailableCycle(unsigned CurrCycle) {
// There is no buffer.
- if (BufferSize <= 1)
+ if (BufferSize == 0)
return CurrCycle;
// Buffer is available now.
@@ -265,7 +279,7 @@ class HardwareUnitInfo {
/// the list of PrioritySUs.
void markScheduled(SUnit *SU, unsigned BlockingCycles);
/// After we've collected all the region pressure for this HWUI, correct for
- /// any specifics of the behavior of this resource. For example, if we the
+ /// any specifics of the behavior of this resource. For example, if the
/// HardwareUnit can hold N instructions simultaneously, then there is no
/// penalty for scheduling N instructions back to back.
void finalizeCycles();
More information about the llvm-branch-commits
mailing list