[llvm] [AMDGPU] Reschedule loads in clauses to improve throughput (PR #102595)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 02:20:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Carl Ritson (perlfu)
<details>
<summary>Changes</summary>
After clauses are formed, their internal loads can be reordered to create additional opportunities for overlapping computation.
This late-stage rescheduling causes no change in register pressure.
---
Patch is 1.82 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102595.diff
70 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIPostRABundler.cpp (+145-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+20-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+30-23)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll (+57-46)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll (+39-33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll (+32-25)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2913-2748)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+21-14)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+218-207)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+1056-863)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+71-70)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+95-94)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+118-112)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+174-165)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+218-197)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+205-197)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+394-329)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+42-10)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+10-11)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+15-11)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+15-11)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+22-20)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+411-268)
- (modified) llvm/test/CodeGen/AMDGPU/ds-alignment.ll (+47-26)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+22-20)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+136-113)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+115-115)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+56-55)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+142-91)
- (modified) llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll (+6-5)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+64-63)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+64-63)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+50-39)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-f32.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+43-25)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+30-23)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+5-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll (+13-10)
- (modified) llvm/test/CodeGen/AMDGPU/load-local.128.ll (+22-17)
- (modified) llvm/test/CodeGen/AMDGPU/load-local.96.ll (+17-13)
- (modified) llvm/test/CodeGen/AMDGPU/max.i16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll (+16-14)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+238-196)
- (modified) llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll (+120-96)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll (+58-46)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+14-10)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+6-5)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+236-158)
- (added) llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir (+198)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+13-12)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+19-15)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 5720b978aada0..80cca7bcfde9c 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include <deque>
using namespace llvm;
@@ -50,6 +51,7 @@ class SIPostRABundler {
bool run(MachineFunction &MF);
private:
+ const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI;
SmallSet<Register, 16> Defs;
@@ -60,6 +62,9 @@ class SIPostRABundler {
bool isBundleCandidate(const MachineInstr &MI) const;
bool isDependentLoad(const MachineInstr &MI) const;
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
+ void reorderLoads(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &BundleStart,
+ MachineBasicBlock::instr_iterator Next);
};
constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -129,6 +134,141 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
!isDependentLoad(NextMI));
}
+static Register getDef(MachineInstr &MI) {
+ assert(MI.getNumExplicitDefs() > 0);
+ return MI.defs().begin()->getReg();
+}
+
+void SIPostRABundler::reorderLoads(
+ MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
+ MachineBasicBlock::instr_iterator Next) {
+ // Don't reorder ALU, store or scalar clauses.
+ if (!BundleStart->mayLoad() || BundleStart->mayStore() ||
+ SIInstrInfo::isSMRD(*BundleStart) || !BundleStart->getNumExplicitDefs())
+ return;
+
+ // Search to find the usage distance of each defined register in the clause.
+ const unsigned SearchDistance = std::max(Defs.size(), 100UL);
+ SmallDenseMap<Register, unsigned> UseDistance;
+ unsigned MaxDistance = 0;
+ for (MachineBasicBlock::iterator SearchI = Next;
+ SearchI != MBB.end() && MaxDistance < SearchDistance &&
+ UseDistance.size() < Defs.size();
+ ++SearchI, ++MaxDistance) {
+ for (Register Reg : Defs) {
+ if (UseDistance.contains(Reg))
+ continue;
+ if (SearchI->readsRegister(Reg, TRI))
+ UseDistance[Reg] = MaxDistance;
+ }
+ }
+
+ if (UseDistance.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Try bundle reordering\n");
+
+ // Build schedule based on use distance of register uses.
+ // Attempt to preserve exist order (NativeOrder) where possible.
+ std::deque<std::pair<MachineInstr *, unsigned>> Schedule;
+ unsigned NativeOrder = 0, LastOrder = 0;
+ bool Reordered = false;
+ for (auto II = BundleStart; II != Next; ++II, ++NativeOrder) {
+ // Bail out if we encounter anything that seems risky to reorder.
+ if (!II->getNumExplicitDefs() || II->isKill() ||
+ llvm::any_of(II->memoperands(), [&](const MachineMemOperand *MMO) {
+ return MMO->isAtomic() || MMO->isVolatile();
+ })) {
+ LLVM_DEBUG(dbgs() << " Abort\n");
+ return;
+ }
+
+ Register Reg = getDef(*II);
+ unsigned NewOrder =
+ UseDistance.contains(Reg) ? UseDistance[Reg] : MaxDistance;
+ LLVM_DEBUG(dbgs() << " Order: " << NewOrder << "," << NativeOrder
+ << ", MI: " << *II);
+ unsigned Order = (NewOrder << 16 | NativeOrder);
+ Schedule.emplace_back(&*II, Order);
+ Reordered |= Order < LastOrder;
+ LastOrder = Order;
+ }
+
+ // No reordering found.
+ if (!Reordered) {
+ LLVM_DEBUG(dbgs() << " No changes\n");
+ return;
+ }
+
+ // Apply sort on new ordering.
+ std::sort(Schedule.begin(), Schedule.end(),
+ [](std::pair<MachineInstr *, unsigned> A,
+ std::pair<MachineInstr *, unsigned> B) {
+ return A.second < B.second;
+ });
+
+ // Rebuild clause order.
+ // Schedule holds ideal order for the load operations; however, each def
+ // can only be scheduled when it will no longer clobber any uses.
+ SmallVector<MachineInstr *> Clause;
+ while (!Schedule.empty()) {
+ // Try to schedule next instruction in schedule.
+ // Iterate until we find something that can be placed.
+ auto It = Schedule.begin();
+ while (It != Schedule.end()) {
+ MachineInstr *MI = It->first;
+ LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);
+
+ if (MI->getNumExplicitDefs() == 0) {
+ // No defs, always schedule.
+ LLVM_DEBUG(dbgs() << " Trivially OK\n");
+ break;
+ }
+
+ Register DefReg = getDef(*MI);
+ bool DefRegHasUse = false;
+ for (auto SearchIt = std::next(It);
+ SearchIt != Schedule.end() && !DefRegHasUse; ++SearchIt)
+ DefRegHasUse = SearchIt->first->readsRegister(DefReg, TRI);
+ if (DefRegHasUse) {
+ // A future use would be clobbered; try next instruction in the
+ // schedule.
+ LLVM_DEBUG(dbgs() << " Clobbers uses\n");
+ It++;
+ continue;
+ }
+
+ // Safe to schedule.
+ LLVM_DEBUG(dbgs() << " OK!\n");
+ break;
+ }
+
+ // Place schedule instruction into clause order.
+ assert(It != Schedule.end());
+ MachineInstr *MI = It->first;
+ Schedule.erase(It);
+ Clause.push_back(MI);
+
+ // Clear kill flags for later uses.
+ for (auto &Use : MI->all_uses()) {
+ if (!Use.isReg() || !Use.isKill())
+ continue;
+ Register UseReg = Use.getReg();
+ if (llvm::any_of(Schedule, [&](std::pair<MachineInstr *, unsigned> &SI) {
+ return SI.first->readsRegister(UseReg, TRI);
+ }))
+ Use.setIsKill(false);
+ }
+ }
+
+ // Apply order to instructions.
+ for (MachineInstr *MI : Clause)
+ MI->moveBefore(&*Next);
+
+ // Update start of bundle.
+ BundleStart = Clause[0]->getIterator();
+}
+
bool SIPostRABundlerLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -143,6 +283,8 @@ PreservedAnalyses SIPostRABundlerPass::run(MachineFunction &MF,
bool SIPostRABundler::run(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
BitVector KillUsedRegUnits(TRI->getNumRegUnits());
@@ -170,7 +312,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
assert(Defs.empty());
if (I->getNumExplicitDefs() != 0)
- Defs.insert(I->defs().begin()->getReg());
+ Defs.insert(getDef(*I));
MachineBasicBlock::instr_iterator BundleStart = I;
MachineBasicBlock::instr_iterator BundleEnd = I;
@@ -182,7 +324,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
if (canBundle(*BundleEnd, *I)) {
BundleEnd = I;
if (I->getNumExplicitDefs() != 0)
- Defs.insert(I->defs().begin()->getReg());
+ Defs.insert(getDef(*I));
++ClauseLength;
} else if (!I->isMetaInstruction() ||
I->getOpcode() == AMDGPU::SCHED_BARRIER) {
@@ -234,6 +376,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
BundleUsedRegUnits.reset();
}
+ reorderLoads(MBB, BundleStart, Next);
finalizeBundle(MBB, BundleStart, Next);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..c04f86391c44b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -716,17 +716,17 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX9-LABEL: add_v11i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20
; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
-; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 6ea0a9446ff9d..7fca4d628d023 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -750,20 +750,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-CONTRACT-NEXT: s_clause 0x8
-; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(7)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -777,20 +777,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x8
-; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(7)
; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3)
; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -804,20 +804,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX11-CONTRACT: ; %bb.0: ; %.entry
; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-CONTRACT-NEXT: s_clause 0x8
-; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20
; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28
; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(7)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -833,20 +833,20 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX11-DENORM: ; %bb.0: ; %.entry
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x8
-; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32
; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-DENORM-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX11-DENORM-NEXT: scratch_load_b32 v36, off, s32 offset:20
; GFX11-DENORM-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32
; GFX11-DENORM-NEXT: scratch_load_b32 v38, off, s32 offset:28
; GFX11-DENORM-NEXT: scratch_load_b32 v39, off, s32 offset:32
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(6)
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(7)
; GFX11-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(4)
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX11-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(2)
+; GFX11-DENORM-NEXT: s_waitcnt vmcnt(3)
; GFX11-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -921,20 +921,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-CONTRACT-NEXT: s_clause 0x8
-; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(7)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -948,20 +948,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x8
-; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(7)
; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX10-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2)
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(3)
; GFX10-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -975,20 +975,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX11-CONTRACT: ; %bb.0: ; %.entry
; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-CONTRACT-NEXT: s_clause 0x8
-; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
; GFX11-CONTRACT-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-CONTRACT-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-CONTRACT-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-CONTRACT-NEXT: scratch_load_b32 v35, off, s32 offset:16
; GFX11-CONTRACT-NEXT: scratch_load_b32 v36, off, s32 offset:20
; GFX11-CONTRACT-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX11-CONTRACT-NEXT: scratch_load_b32 v31, off, s32
; GFX11-CONTRACT-NEXT: scratch_load_b32 v38, off, s32 offset:28
; GFX11-CONTRACT-NEXT: scratch_load_b32 v39, off, s32 offset:32
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(6)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(7)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(4)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
-; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(2)
+; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
; GFX11-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
@@ -1004,20 +1004,20 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX11-DENORM: ; %bb.0: ; %.entry
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x8
-; GFX11-DENORM-NEXT: scratch_load_b32 v31, off, s32
; GFX11-DENORM-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-DENORM-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-DENORM-NEXT: scratch_load_b32 v34, off, s32 offse...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102595
More information about the llvm-commits
mailing list