[llvm] [AMDGPU] Add DS loop waitcnt optimization for GFX12+ (PR #172728)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 17 14:53:37 PST 2025
https://github.com/hidekisaito updated https://github.com/llvm/llvm-project/pull/172728
>From 1b0e536c4c8705b8022714224b7c2392aa9d6249 Mon Sep 17 00:00:00 2001
From: Hideki Saito <hidekido at amd.com>
Date: Wed, 17 Dec 2025 15:09:15 -0500
Subject: [PATCH] [AMDGPU] Add DS loop waitcnt optimization for GFX12+
Add support for flushing DS_CNT in loop preheaders when the loop uses values that were DS-loaded outside the loop. This is similar to the existing VMEM loop optimization.
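For example, given a preheader that DS-loads values consumed only inside the
loop, the pass is now expected to flush DS_CNT at the end of the preheader
instead of leaving the wait inside the loop body. A hand-written sketch
(adapted from the MIR test added below; register numbers are illustrative and
this is not actual compiler output):

    bb.0:                                           ; preheader
      $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
      S_WAIT_DSCNT 0                                ; flushed here by this patch
      S_BRANCH %bb.1

    bb.1:                                           ; loop body
      ; uses of $vgpr10..$vgpr13 need no S_WAIT_DSCNT here
      $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
      $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
      S_CBRANCH_SCC1 %bb.1, implicit $scc

Without the preheader flush, the S_WAIT_DSCNT would be inserted before the
first use in bb.1 and executed on every iteration.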
Assisted-by: Cursor / claude-4.5-opus-high
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 224 +++++++++++++-----
llvm/test/CodeGen/AMDGPU/waitcnt-loop-opt.mir | 159 +++++++++++++
2 files changed, 329 insertions(+), 54 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-loop-opt.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e21583ae0876f..6b0bcfc6b4dcf 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -419,6 +419,12 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
+// Flags indicating which counters should be flushed in a loop preheader.
+struct PreheaderFlushFlags {
+ bool FlushVmCnt = false;
+ bool FlushDsCnt = false;
+};
+
class SIInsertWaitcnts {
public:
const GCNSubtarget *ST;
@@ -431,7 +437,7 @@ class SIInsertWaitcnts {
private:
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
- DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+ DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
AliasAnalysis *AA = nullptr;
@@ -492,10 +498,13 @@ class SIInsertWaitcnts {
return 0;
}
- bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
- bool isPreheaderToFlush(MachineBasicBlock &MBB,
- const WaitcntBrackets &ScoreBrackets);
+ PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
+ const WaitcntBrackets &Brackets);
+ PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
+ const WaitcntBrackets &ScoreBrackets);
bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
+ bool isDSRead(const MachineInstr &MI) const;
+ bool mayStoreLDS(const MachineInstr &MI) const;
bool run(MachineFunction &MF);
void setForceEmitWaitcnt() {
@@ -570,7 +579,7 @@ class SIInsertWaitcnts {
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt);
+ PreheaderFlushFlags FlushFlags);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1932,12 +1941,12 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
-/// flush the vmcnt counter here.
-bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr,
- bool FlushVmCnt) {
+/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here.
+/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here
+/// (GFX12+ only, where DS_CNT is a separate counter).
+bool SIInsertWaitcnts::generateWaitcntInstBefore(
+ MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
setForceEmitWaitcnt();
assert(!MI.isMetaInstruction());
@@ -2201,7 +2210,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (ForceEmitWaitcnt[X_CNT])
Wait.XCnt = 0;
- if (FlushVmCnt) {
+ if (FlushFlags.FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
Wait.LoadCnt = 0;
if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
@@ -2210,6 +2219,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.BvhCnt = 0;
}
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ Wait.DsCnt = 0;
+
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
Wait.LoadCnt = 0;
@@ -2579,12 +2591,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
- isPreheaderToFlush(Block, ScoreBrackets);
+ PreheaderFlushFlags FlushFlags;
+ if (Block.getFirstTerminator() == Inst)
+ FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
- FlushVmCnt);
+ FlushFlags);
OldWaitcntInstr = nullptr;
// Restore vccz if it's not known to be correct already.
@@ -2658,17 +2671,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
- // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
- // needed.
+ // Flush counters at the end of the block if needed (for preheaders with no
+ // terminator).
AMDGPU::Waitcnt Wait;
- if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets)) {
- if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
- Wait.LoadCnt = 0;
- if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
- Wait.SampleCnt = 0;
- if (ScoreBrackets.hasPendingEvent(BVH_CNT))
- Wait.BvhCnt = 0;
+ if (Block.getFirstTerminator() == Block.end()) {
+ PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
+ if (FlushFlags.FlushVmCnt) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+ if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))
+ Wait.DsCnt = 0;
}
// Combine or remove any redundant waitcnts at the end of the block.
@@ -2684,29 +2701,28 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
return Modified;
}
-// Return true if the given machine basic block is a preheader of a loop in
-// which we want to flush the vmcnt counter, and false otherwise.
-bool SIInsertWaitcnts::isPreheaderToFlush(
+// Return flags indicating which counters should be flushed in the preheader.
+PreheaderFlushFlags SIInsertWaitcnts::isPreheaderToFlush(
MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
- auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+ auto [Iterator, IsInserted] =
+ PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags());
if (!IsInserted)
return Iterator->second;
MachineBasicBlock *Succ = MBB.getSingleSuccessor();
if (!Succ)
- return false;
+ return PreheaderFlushFlags();
MachineLoop *Loop = MLI->getLoopFor(Succ);
if (!Loop)
- return false;
+ return PreheaderFlushFlags();
- if (Loop->getLoopPreheader() == &MBB &&
- shouldFlushVmCnt(Loop, ScoreBrackets)) {
- Iterator->second = true;
- return true;
+ if (Loop->getLoopPreheader() == &MBB) {
+ Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
+ return Iterator->second;
}
- return false;
+ return PreheaderFlushFlags();
}
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
@@ -2715,38 +2731,90 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
return SIInstrInfo::isVMEM(MI);
}
-// Return true if it is better to flush the vmcnt counter in the preheader of
-// the given loop. We currently decide to flush in two situations:
+bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
+ return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore();
+}
+
+// Check if instruction may store to LDS (including DS stores, atomics,
+// FLAT instructions that may access LDS, LDS DMA, and tensor load to LDS).
+bool SIInsertWaitcnts::mayStoreLDS(const MachineInstr &MI) const {
+ if (SIInstrInfo::mayWriteLDSThroughDMA(MI))
+ return true;
+ unsigned Opc = MI.getOpcode();
+ if (Opc == AMDGPU::TENSOR_LOAD_TO_LDS || Opc == AMDGPU::TENSOR_LOAD_TO_LDS_D2)
+ return true;
+ if (!MI.mayStore())
+ return false;
+ if (SIInstrInfo::isDS(MI))
+ return true;
+ if (SIInstrInfo::isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI))
+ return true;
+ return false;
+}
+
+// Return flags indicating which counters should be flushed in the preheader of
+// the given loop. We currently decide to flush in a few situations:
+// For VMEM (FlushVmCnt):
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
// vgpr containing a value that is loaded outside of the loop. (Only on
// targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
// loop, and at least one use of a vgpr containing a value that is loaded
// outside of the loop.
-bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
- const WaitcntBrackets &Brackets) {
+// For DS (FlushDsCnt, GFX12+ only):
+// 3. The loop contains no DS reads, and at least one use of a vgpr containing
+// a value that is DS loaded outside of the loop.
+// 4. The loop contains DS read(s), loaded values are not used in the same
+// iteration but in the next iteration (prefetch pattern), and at least one
+// use of a vgpr containing a value that is DS loaded outside of the loop.
+// Flushing in preheader reduces wait overhead if the wait requirement in
+// iteration 1 would otherwise be more strict.
+PreheaderFlushFlags
+SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
+ const WaitcntBrackets &Brackets) {
+ PreheaderFlushFlags Flags;
bool HasVMemLoad = false;
bool HasVMemStore = false;
- bool UsesVgprLoadedOutside = false;
+ bool SeenDSStoreInLoop = false;
+ bool UsesVgprLoadedOutsideVMEM = false;
+ bool UsesVgprLoadedOutsideDS = false;
+ bool VMemInvalidated = false;
+ // DS optimization only applies to GFX12+ where DS_CNT is separate.
+ bool DSInvalidated = !ST->hasExtendedWaitCounts();
DenseSet<MCRegUnit> VgprUse;
- DenseSet<MCRegUnit> VgprDef;
+ DenseSet<MCRegUnit> VgprDefVMEM;
+ DenseSet<MCRegUnit> VgprDefDS;
for (MachineBasicBlock *MBB : ML->blocks()) {
+ bool SeenDSStoreInCurrMBB = false;
for (MachineInstr &MI : *MBB) {
if (isVMEMOrFlatVMEM(MI)) {
HasVMemLoad |= MI.mayLoad();
HasVMemStore |= MI.mayStore();
}
-
+ if (mayStoreLDS(MI))
+ SeenDSStoreInCurrMBB = true;
+ // A barrier in this MBB only clears DS stores seen earlier in the same MBB.
+ if (MI.getOpcode() == AMDGPU::S_BARRIER)
+ SeenDSStoreInCurrMBB = false;
for (const MachineOperand &Op : MI.all_uses()) {
if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
// Vgpr use
for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
- if (VgprDef.contains(RU))
- return false;
+ // are invalidated.
+ if (VgprDefVMEM.contains(RU))
+ VMemInvalidated = true;
+
+ // Check for DS loads used inside the loop
+ if (VgprDefDS.contains(RU))
+ DSInvalidated = true;
+
+ // Early exit if both optimizations are invalidated
+ if (VMemInvalidated && DSInvalidated)
+ return Flags;
+
VgprUse.insert(RU);
// If at least one of Op's registers is in the score brackets, the
// value is likely loaded outside of the loop.
@@ -2757,8 +2825,20 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
Brackets.getScoreLB(SAMPLE_CNT) ||
Brackets.getVMemScore(ID, BVH_CNT) >
Brackets.getScoreLB(BVH_CNT)) {
- UsesVgprLoadedOutside = true;
- break;
+ UsesVgprLoadedOutsideVMEM = true;
+ }
+ // Check if loaded outside the loop via DS (not VMEM/FLAT)
+ // Only consider it a DS load if there's no pending VMEM load for
+ // this register, since FLAT can set both counters.
+ bool HasPendingVMEM = Brackets.getVMemScore(ID, LOAD_CNT) >
+ Brackets.getScoreLB(LOAD_CNT) ||
+ Brackets.getVMemScore(ID, SAMPLE_CNT) >
+ Brackets.getScoreLB(SAMPLE_CNT) ||
+ Brackets.getVMemScore(ID, BVH_CNT) >
+ Brackets.getScoreLB(BVH_CNT);
+ if (!HasPendingVMEM && Brackets.getVMemScore(ID, DS_CNT) >
+ Brackets.getScoreLB(DS_CNT)) {
+ UsesVgprLoadedOutsideDS = true;
}
}
}
@@ -2768,18 +2848,54 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
for (const MachineOperand &Op : MI.all_defs()) {
for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
- // are invalidated and we can exit.
+ // are invalidated.
if (VgprUse.contains(RU))
- return false;
- VgprDef.insert(RU);
+ VMemInvalidated = true;
+ VgprDefVMEM.insert(RU);
+ }
+ }
+ // Early exit if both optimizations are invalidated
+ if (VMemInvalidated && DSInvalidated)
+ return Flags;
+ }
+
+ // DS read vgpr def
+ // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RU).
+ // If USE comes before DEF, it's the prefetch pattern (use value from
+ // previous iteration, load for next iteration). We should still flush
+ // in preheader so iteration 1 doesn't need to wait inside the loop.
+ // Only invalidate when DEF comes before USE (same-iteration consumption,
+ // checked above when processing uses).
+ if (isDSRead(MI)) {
+ for (const MachineOperand &Op : MI.all_defs()) {
+ for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+ VgprDefDS.insert(RU);
}
}
}
}
+ // Accumulate unprotected DS stores from this MBB
+ SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
}
- if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
- return true;
- return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
+
+ // VMEM flush decision
+ if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
+ ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
+ (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
+ Flags.FlushVmCnt = true;
+
+ // DS flush decision: flush if loop uses DS-loaded values from outside
+ // and either has no DS reads in the loop, or DS reads whose results
+ // are not used in the loop.
+ // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
+ // is LGKM_CNT which also tracks FLAT/SMEM.
+ // DS stores share DS_CNT with DS reads, but stores before a barrier are OK
+ // since the barrier ensures completion. Only disable if there are unprotected
+ // DS stores (not followed by a barrier in the same MBB).
+ if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
+ Flags.FlushDsCnt = true;
+
+ return Flags;
}
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-opt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-opt.mir
new file mode 100644
index 0000000000000..a8219cf0e17f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-opt.mir
@@ -0,0 +1,159 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
+
+# Test 1: Simple case - DS loads only in preheader, no DS loads in loop.
+# The optimization flushes DSCNT in the preheader so that no S_WAIT_DSCNT has
+# to be placed inside the loop body, where it would execute on every iteration.
+
+# CHECK-LABEL: name: ds_preheader_flush_simple
+# CHECK: bb.0:
+# CHECK: DS_READ_B128
+# CHECK: DS_READ_B128
+# CHECK: DS_READ_B128
+# CHECK: DS_READ_B128
+# CHECK: S_WAIT_DSCNT 0
+# CHECK-NEXT: S_BRANCH %bb.1
+# CHECK: bb.1:
+# CHECK-NOT: S_WAIT_DSCNT
+# CHECK: $vgpr30 = V_ADD_F32
+# CHECK-NOT: S_WAIT_DSCNT
+# CHECK: $vgpr31 = V_ADD_F32
+
+--- |
+ define amdgpu_kernel void @ds_preheader_flush_simple() { ret void }
+ define amdgpu_kernel void @ds_loop_prefetch_pattern() { ret void }
+...
+
+---
+name: ds_preheader_flush_simple
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10, $vgpr14, $vgpr18, $vgpr22
+
+ ; Use DS-loaded values (no DS loads inside the loop)
+ $vgpr30 = V_ADD_F32_e32 $vgpr10, $vgpr14, implicit $mode, implicit $exec
+ $vgpr31 = V_ADD_F32_e32 $vgpr18, $vgpr22, implicit $mode, implicit $exec
+
+ ; Loop control
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+# Test 2: Prefetch pattern - DS loads both in preheader AND at end of loop.
+# USE comes before DEF (prefetch pattern): values are used first, then
+# new values are loaded for the next iteration. Since the DEF happens
+# AFTER the USE in program order, we can still flush in preheader.
+# Flushing in the preheader keeps the preheader loads from forcing stricter
+# waits inside the loop on iteration 1.
+#
+# The cascading wait pattern shows decreasing dscnt values (12, 8, 4, 0)
+# as each group of registers becomes ready.
+#
+# CHECK-LABEL: name: ds_loop_prefetch_pattern
+# CHECK: bb.0:
+# CHECK: DS_READ_B128
+# CHECK: S_WAIT_DSCNT 0
+# CHECK-NEXT: S_BRANCH %bb.1
+# CHECK: bb.1:
+# CHECK: S_WAIT_DSCNT 12
+# CHECK-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+# CHECK: S_WAIT_DSCNT 8
+# CHECK-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA
+# CHECK: S_WAIT_DSCNT 4
+# CHECK-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+# CHECK: S_WAIT_DSCNT 0
+# CHECK-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA
+
+name: ds_loop_prefetch_pattern
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr0, $vgpr0
+
+ ; Preheader: DS loads for first iteration
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+ ; WMMA using vgpr10-25 (loaded in preheader or previous iteration's prefetch)
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+
+ ; More WMMAs using other registers
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ ; Repeat with same registers
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+ early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+ ; Barrier
+ S_BARRIER
+
+ ; Prefetch DS loads for next iteration (this prevents simple optimization)
+ $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+ $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+ $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+ $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+ $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+ $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+ $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+ $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+ $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+ $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+ $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+ $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+ $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+ $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+ $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+ ; Loop control
+ $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...