[llvm-branch-commits] [llvm] [AMDGPU] DS loop wait relaxation -- more test cases and improvements … (PR #171952)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 11 19:01:15 PST 2025
https://github.com/hidekisaito created https://github.com/llvm/llvm-project/pull/171952
…to handle them (4/4)
Add handling for same-iteration use/overwrite of DS load results:
- Track DS load destinations and detect when results are used or overwritten within the same iteration
- Compute FloorWaitCount for WMMAs that only use flushed loads
Add bailout for tensor_load_to_lds and LDS DMA writes after barrier
Add negative test based on profitability criteria
Assisted-by: Cursor / claude-4.5-opus-high
Depends on https://github.com/llvm/llvm-project/pull/171948
>From 238a970d621ed4b0758d8042ec30ed89895c4c3c Mon Sep 17 00:00:00 2001
From: Hideki Saito <hidekido at amd.com>
Date: Thu, 11 Dec 2025 21:56:17 -0500
Subject: [PATCH] [AMDGPU] DS loop wait relaxation -- more test cases and
improvements to handle them (4/4)
Add handling for same-iteration use/overwrite of DS load results:
- Track DS load destinations and detect when results are used or
overwritten within the same iteration
- Compute FloorWaitCount for WMMAs that only use flushed loads
Add bailout for tensor_load_to_lds and LDS DMA writes after barrier
Add negative test based on profitability criteria
Assisted-by: Cursor / claude-4.5-opus-high
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 99 +++++++++++++++-
.../AMDGPU/waitcnt-loop-ds-opt-eligible.mir | 2 +-
.../waitcnt-loop-ds-opt-no-improvement.mir | 109 +++++++++++++++++
...aitcnt-loop-ds-opt-same-iter-overwrite.mir | 111 ++++++++++++++++++
.../waitcnt-loop-ds-opt-same-iter-use.mir | 107 +++++++++++++++++
.../waitcnt-loop-ds-opt-tensor-load.mir | 97 +++++++++++++++
6 files changed, 518 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-use.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-tensor-load.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 28bc57ed2db4e..55c0d72c125af 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -468,6 +468,11 @@ class SIInsertWaitcnts {
mutable bool RelaxationApplied = false;
// Pointer to the last barrier in the loop (found during eligibility check)
const MachineInstr *LastBarrier = nullptr;
+ // The wait count "floor" established by same-iteration uses/overwrites.
+ // When a DS load result is used in the same iteration, the baseline inserts
+ // a wait. This floor indicates the expected counter state after that wait.
+ // WMMAs that only use flushed loads can rely on this floor.
+ unsigned FloorWaitCount = 0;
};
// Cache of loop DS wait optimization info, keyed by loop header MBB.
@@ -2775,9 +2780,21 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
// if one exists. LastBarrier was already found during eligibility check.
// These are likely to be prefetch loads whose results are used in the next
// iteration.
+ //
+ // If a load result is used or overwritten within the same iteration, the
+ // baseline will insert a wait before that instruction. Since DS loads
+ // complete in FIFO order, that wait also completes all earlier loads. So we
+ // can drop those "flushed" loads from our tracking and only consider
+ // subsequent loads as true prefetch loads. Overwrites also require the load
+ // to complete first to avoid write-after-write races.
const MachineInstr *LastBarrier = Info.LastBarrier;
+ // Single pass: track DS load destinations, handle uses (which flush prior
+ // loads) and detect overwrites (which invalidate our analysis).
+ // TrackedLoads: (Register, Position) pairs for checking uses/overwrites
+ SmallVector<std::pair<Register, unsigned>, 64> TrackedLoads;
unsigned LoadPosition = 0;
+ unsigned LastFlushedPosition = 0; // Loads up to this position will be flushed
bool AfterLastBarrier = (LastBarrier == nullptr); // If no barrier, track all
for (const MachineInstr &MI : *MBB) {
@@ -2789,6 +2806,42 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
if (!AfterLastBarrier)
continue;
+ // Check for instructions that write to LDS through DMA (global_load_lds,
+ // etc). These write to LDS but aren't DS instructions.
+ // Bail out if any appear after the barrier.
+ if (SIInstrInfo::mayWriteLDSThroughDMA(MI)) {
+ LLVM_DEBUG(
+ dbgs() << "Loop DS Wait Opt: LDS DMA write after last barrier, "
+ << "skipping\n");
+ Info.Valid = false;
+ return;
+ }
+
+ // Check for tensor_load_to_lds instructions (MIMG, not caught by above)
+ if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
+ MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2) {
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: tensor_load_to_lds after last "
+ << "barrier, skipping\n");
+ Info.Valid = false;
+ return;
+ }
+
+ // Check if this instruction uses or overwrites any tracked DS load
+ // destination. If so, baseline will have inserted a wait that flushes
+ // all loads up to that position (since DS loads complete in order).
+ // Overwrites also require the load to complete first to avoid races.
+ for (auto &[Reg, Position] : TrackedLoads) {
+ if (Position <= LastFlushedPosition)
+ continue; // Already flushed
+
+ if (MI.readsRegister(Reg, TRI) || MI.modifiesRegister(Reg, TRI)) {
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS load at position "
+ << Position << " used/overwritten in same iteration, "
+ << "flushing positions 1-" << Position << "\n");
+ LastFlushedPosition = std::max(LastFlushedPosition, Position);
+ }
+ }
+
// Check DS instructions
if (SIInstrInfo::isDS(MI)) {
// DS stores after barrier not allowed - same counter, may complete
@@ -2806,6 +2859,7 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
for (const MachineOperand &Op : MI.defs()) {
if (Op.isReg() && Op.getReg().isPhysical() &&
TRI->isVGPR(*MRI, Op.getReg())) {
+ TrackedLoads.emplace_back(Op.getReg(), LoadPosition);
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
Info.VGPRToLoadPosition[static_cast<unsigned>(Unit)] =
LoadPosition;
@@ -2816,12 +2870,32 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
}
}
- Info.TotalDSLoads = LoadPosition;
+ // Filter out flushed loads and renumber remaining ones
+ // Also compute the floor wait count - the wait established by same-iteration
+ // use
+ if (LastFlushedPosition > 0) {
+ DenseMap<unsigned, unsigned> NewMap;
+ for (auto &[RegUnit, Position] : Info.VGPRToLoadPosition) {
+ if (Position > LastFlushedPosition) {
+ NewMap[RegUnit] = Position - LastFlushedPosition;
+ }
+ }
+ Info.VGPRToLoadPosition = std::move(NewMap);
+ // FloorWaitCount: when same-iteration use waits for load N, it leaves
+ // (TotalLoads - N) loads in flight. For the next iteration's WMMAs,
+ // any that only use flushed loads are already covered by this wait.
+ Info.FloorWaitCount = LoadPosition - LastFlushedPosition;
+ } else {
+ Info.FloorWaitCount = 0;
+ }
+
+ Info.TotalDSLoads = LoadPosition - LastFlushedPosition;
Info.Valid = Info.TotalDSLoads > 0;
LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at ";
MBB->printName(dbgs());
- dbgs() << " - " << Info.TotalDSLoads << " DS loads"
+ dbgs() << " - " << Info.TotalDSLoads << " DS loads, "
+ << "FloorWaitCount=" << Info.FloorWaitCount
<< ", HasBarrier=" << (LastBarrier != nullptr)
<< ", Valid=" << Info.Valid << "\n");
}
@@ -2851,13 +2925,26 @@ SIInsertWaitcnts::getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
}
}
- if (MaxLoadPosition == 0)
- return std::nullopt;
-
// Optimal wait = TotalDSLoads - MaxLoadPosition
// This means we wait until all loads up to and including MaxLoadPosition
// have completed, but loads after it can still be in flight.
- return Info.TotalDSLoads - MaxLoadPosition;
+ unsigned OptimalWait = Info.TotalDSLoads - MaxLoadPosition;
+
+ // If MaxLoadPosition == 0, this instruction only uses flushed loads
+ // (whose results are used in the same iteration). The same-iteration use
+ // will insert a wait that leaves FloorWaitCount loads in flight.
+ // So this instruction's needs are covered if OptimalWait >= FloorWaitCount.
+ // We return FloorWaitCount to indicate "can relax to this level".
+ if (MaxLoadPosition == 0 && Info.FloorWaitCount > 0) {
+ // All operands are from flushed loads - covered by same-iteration use's
+ // wait
+ return Info.FloorWaitCount;
+ }
+
+ if (MaxLoadPosition == 0)
+ return std::nullopt;
+
+ return OptimalWait;
}
// Try to apply DS loop wait optimization to relax conservative wait counts.
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
index e6237338fda5b..ba0051ab03409 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -15,7 +15,7 @@
# With opt: S_WAIT_DSCNT 12 (wait for only 4 loads, 12 remain in flight)
#
# DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
-# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
+# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, FloorWaitCount=0, HasBarrier=1, Valid=1
# DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:
# DBG: DS Loop Opt: Inserted DS_CNT flush in preheader bb.0 for loop at bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir
new file mode 100644
index 0000000000000..3a4dd9924edaa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Test: preheader loads with slight reordering
+# Baseline is close to optimal, improvement below threshold of 4
+# Both OPT and NOOPT should produce the same wait count.
+
+--- |
+ define amdgpu_kernel void @ds_loop_no_improvement() { ret void }
+...
+
+---
+# OPT-LABEL: name: ds_loop_no_improvement
+# NOOPT-LABEL: name: ds_loop_no_improvement
+name: ds_loop_no_improvement
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ waveLimiter: false
+body: |
+ ; Check preheader: neither OPT nor NOOPT adds flush (optimization doesn't apply)
+ ; OPT: bb.0:
+ ; OPT-NOT: S_WAIT_DSCNT
+ ; OPT: S_BRANCH %bb.1
+
+ ; NOOPT: bb.0:
+ ; NOOPT-NOT: S_WAIT_DSCNT
+ ; NOOPT: S_BRANCH %bb.1
+
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads with slight reordering (Pulled up Loads #5 and #6)
+ ; This creates a small mismatch with loop body order, so baseline is
+ ; close to optimal but not perfect. Improvement should be below threshold.
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ ; Preheader has loads #5,#6 first, then #1-4, then rest.
+ ; First WMMA uses loads #1-4 (vgpr10-25), which are at positions 3-6 in preheader.
+ ; Baseline produces S_WAIT_DSCNT 10 (wait for first 6 loads to complete).
+ ; Optimal would be 12 (only need first 4 loads in loop body order).
+ ; Improvement = 12 - 10 = 2, which is below threshold of 4, so no relaxation.
+ ; OPT: bb.1:
+ ; OPT: S_WAIT_DSCNT 10
+ ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ ; NOOPT: bb.1:
+ ; NOOPT: S_WAIT_DSCNT 10
+ ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+ ; First WMMA uses vgpr10-25 (loads #1-4)
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ S_BARRIER
+
+ ; Prefetch DS loads for next iteration (FORWARD order)
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir
new file mode 100644
index 0000000000000..f2a38ab7f3d8e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Test: same-iteration OVERWRITE of a DS load result (load #4 = vgpr22-25)
+# This flushes loads #1-4, leaving loads #5-16 (12 loads) as prefetch.
+# Tests that the analysis correctly handles same-iteration overwrites.
+
+--- |
+ define amdgpu_kernel void @ds_loop_same_iter_overwrite() { ret void }
+...
+
+---
+# OPT-LABEL: name: ds_loop_same_iter_overwrite
+# NOOPT-LABEL: name: ds_loop_same_iter_overwrite
+name: ds_loop_same_iter_overwrite
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ waveLimiter: false
+body: |
+ ; OPT: bb.0:
+ ; OPT: S_WAIT_DSCNT_soft 0
+ ; OPT-NEXT: S_BRANCH %bb.1
+
+ ; NOOPT: bb.0:
+ ; NOOPT-NOT: S_WAIT_DSCNT
+ ; NOOPT: S_BRANCH %bb.1
+
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads in REVERSE order
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ ; First WMMA uses flushed loads #1-4, gets FloorWaitCount = 12
+ ; Second WMMA uses loads #5-8 (positions 1-4), gets 12-4 = 8
+ ; OPT: bb.1:
+ ; OPT: S_WAIT_DSCNT 12
+ ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+ ; OPT: S_WAIT_DSCNT 8
+ ; OPT-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA
+
+ ; NOOPT: bb.1:
+ ; NOOPT: S_WAIT_DSCNT 0
+ ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+ ; First WMMA uses vgpr10-25 (loads #1-4). Loads #1-4 are flushed by overwrite.
+ ; Remaining loads #5-16 (12 loads), WMMA uses loads #5-8 (positions 1-4)
+ ; OptWait = 12 - 4 = 8
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ S_BARRIER
+
+ ; Prefetch DS loads
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ ; Same-iteration OVERWRITE of load #4 result - flushes loads #1-4
+ ; (Baseline must wait for #4 to complete before writing to same reg)
+ $vgpr22 = V_ADD_U32_e32 0, $vgpr0, implicit $exec
+
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-use.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-use.mir
new file mode 100644
index 0000000000000..524a3bbfda9e5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-use.mir
@@ -0,0 +1,107 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Test: same-iteration USE of a DS load result (load #2 = vgpr14-17)
+# This flushes loads #1-2, leaving loads #3-16 (14 loads) as prefetch.
+# Tests that the analysis correctly handles same-iteration uses.
+
+--- |
+ define amdgpu_kernel void @ds_loop_same_iter_use() { ret void }
+...
+
+---
+# OPT-LABEL: name: ds_loop_same_iter_use
+# NOOPT-LABEL: name: ds_loop_same_iter_use
+name: ds_loop_same_iter_use
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ waveLimiter: false
+body: |
+ ; OPT: bb.0:
+ ; OPT: S_WAIT_DSCNT_soft 0
+ ; OPT-NEXT: S_BRANCH %bb.1
+
+ ; NOOPT: bb.0:
+ ; NOOPT-NOT: S_WAIT_DSCNT
+ ; NOOPT: S_BRANCH %bb.1
+
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads in REVERSE order
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ ; OPT: bb.1:
+ ; OPT: S_WAIT_DSCNT 12
+ ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ ; NOOPT: bb.1:
+ ; NOOPT: S_WAIT_DSCNT 0
+ ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr100
+
+ ; First WMMA uses vgpr10-25 (loads #1-4). Loads #1-2 are flushed by same-iter use.
+ ; Only loads #3-4 (now positions 1-2) are tracked. OptWait = 14 - 2 = 12
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ S_BARRIER
+
+ ; Prefetch DS loads
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ ; Same-iteration USE of load #2 result - flushes loads #1-2
+ ; Remaining loads #3-16 (14 loads), WMMA uses loads #3-4 (positions 1-2)
+ ; OptWait = 14 - 2 = 12
+ $vgpr100 = V_MOV_B32_e32 $vgpr14, implicit $exec
+
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-tensor-load.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-tensor-load.mir
new file mode 100644
index 0000000000000..83c8f51474f57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-tensor-load.mir
@@ -0,0 +1,97 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -debug-only=si-insert-waitcnts -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=DBG %s
+
+# REQUIRES: asserts
+
+# Test: tensor_load_to_lds instruction after barrier should cause bailout
+# The optimization should not be applied when there are tensor_load_to_lds
+# instructions after the last barrier, as they also write to LDS.
+
+# DBG: Loop DS Wait Opt: tensor_load_to_lds after last barrier, skipping
+
+--- |
+ define amdgpu_kernel void @ds_loop_tensor_load() { ret void }
+...
+
+---
+name: ds_loop_tensor_load
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ waveLimiter: false
+body: |
+ ; OPT-LABEL: name: ds_loop_tensor_load
+
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22_sgpr23
+
+ ; Preheader: DS loads in REVERSE order
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ ; tensor_load_to_lds after barrier causes optimization to bail out.
+  ; The conservative wait (S_WAIT_DSCNT 0) is emitted, identical to the unoptimized output.
+ ; OPT: bb.1:
+ ; OPT: S_WAIT_DSCNT 0
+ ; OPT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22_sgpr23, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+ ; First WMMA uses vgpr10-25 (loads #1-4)
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ S_BARRIER
+
+ ; TENSOR_LOAD_TO_LDS after barrier - causes optimization to bail out
+ TENSOR_LOAD_TO_LDS $sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22_sgpr23, 0, 0, implicit $exec, implicit $tensorcnt, implicit-def $tensorcnt
+
+ ; Prefetch DS loads
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
More information about the llvm-branch-commits
mailing list