[llvm] [AMDGPU] Reschedule loads in clauses to improve throughput (RFC) (PR #102595)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 9 03:18:07 PDT 2024
https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/102595
After clauses are formed, their internal loads can be reordered to create additional opportunities for overlapping computation with in-flight loads.
This late-stage rescheduling does not change register pressure.
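To sketch the intent (the instructions, registers, and waitcnt values below are purely illustrative, not taken from the attached tests): if the load whose result is consumed first is issued first in the clause, its consumer can run behind a partial s_waitcnt instead of draining the whole clause.

    ; before reordering
    s_clause 0x1
    image_load v1, ...        ; result not needed until later
    image_load v0, ...        ; result needed immediately
    s_waitcnt vmcnt(0)        ; must wait for both loads
    v_add_f32 v2, v0, v2

    ; after reordering
    s_clause 0x1
    image_load v0, ...
    image_load v1, ...
    s_waitcnt vmcnt(1)        ; waits only for the first load
    v_add_f32 v2, v0, v2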
From 4169b41b16ce3f07bcd2c7fc9bd6c02bdc0ab2e9 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Fri, 9 Aug 2024 18:18:41 +0900
Subject: [PATCH] [AMDGPU] Reschedule loads in clauses to improve throughput
After clauses are formed, their internal loads can be reordered
to create additional opportunities for overlapping
computation.
This late-stage rescheduling does not change register pressure.
---
llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 127 +++++++++++
...e92561-restore-undef-scc-verifier-error.ll | 7 +-
.../AMDGPU/reschedule-bundle-loads.mir | 198 ++++++++++++++++++
3 files changed, 329 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 8464cb3d6fc43d..969b85fb2c169e 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -44,6 +44,7 @@ class SIPostRABundler : public MachineFunctionPass {
}
private:
+ const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI;
SmallSet<Register, 16> Defs;
@@ -54,6 +55,9 @@ class SIPostRABundler : public MachineFunctionPass {
bool isBundleCandidate(const MachineInstr &MI) const;
bool isDependentLoad(const MachineInstr &MI) const;
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
+ void reorderLoads(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &BundleStart,
+ MachineBasicBlock::instr_iterator Next);
};
constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -121,10 +125,132 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
!isDependentLoad(NextMI));
}
+void SIPostRABundler::reorderLoads(
+ MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
+ MachineBasicBlock::instr_iterator Next) {
+  // Only reorder clauses of MIMG loads.
+  if (!TII->isMIMG(BundleStart->getOpcode()) || BundleStart->mayStore())
+    return;
+
+ LLVM_DEBUG(dbgs() << "Begin bundle reorder\n");
+
+ // Collect clause
+ SmallVector<MachineInstr *> Clause;
+ for (auto II = BundleStart; II != Next; ++II)
+ Clause.push_back(&*II);
+
+ // Search to find the usage distance of each defined register in the clause.
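+  // The search is capped at MaxSearch instructions beyond the clause.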
+ const int MaxSearch = 100;
+ SmallSet<Register, 16> DefRegs(Defs);
+ SmallSet<unsigned, 16> Distances;
+ DenseMap<Register, unsigned> UseDistance;
+ unsigned Dist = 0;
+ for (MachineBasicBlock::iterator SearchI = Next;
+ SearchI != MBB.end() && Dist < MaxSearch && !DefRegs.empty();
+ ++SearchI, ++Dist) {
+ SmallVector<Register, 4> Found;
+ // FIXME: fix search efficiency
+ for (Register DefReg : DefRegs) {
+ if (SearchI->readsRegister(DefReg, TRI))
+ Found.push_back(DefReg);
+ }
+ for (Register Reg : Found) {
+ UseDistance[Reg] = Dist;
+ DefRegs.erase(Reg);
+ Distances.insert(Dist);
+ }
+ }
+
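+  // If every def is first used at the same distance, or no uses were found
+  // within the search window, reordering cannot help.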
+ if (Distances.size() <= 1)
+ return;
+
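+  // Assign each load an ordering key: loads whose results are consumed
+  // sooner receive smaller keys and migrate toward the front of the clause.
+  // TotalOrder starts beyond any recorded use distance, so loads without a
+  // recorded use keep their original relative order.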
+ std::vector<std::pair<MachineInstr *, unsigned>> Schedule;
+ unsigned TotalOrder = Dist + 1;
+ bool Reorder = false;
+ for (MachineInstr *MI : Clause) {
+ unsigned Order = TotalOrder++;
+    // Loads whose result has no recorded first use keep their default
+    // order; everything else is keyed by first-use distance.
+    if (MI->getNumExplicitDefs() != 0) {
+      Register Reg = MI->defs().begin()->getReg();
+      if (UseDistance.contains(Reg)) {
+        Order = std::min(Order, UseDistance[Reg]);
+        Reorder = true;
+      }
+    }
+ LLVM_DEBUG(dbgs() << "Order: " << Order << ", MI: " << *MI);
+ Schedule.push_back(std::pair(MI, Order));
+ }
+
+ if (!Reorder)
+ return;
+
+  // Use a stable sort so loads with equal keys keep their original clause
+  // order, keeping the output deterministic.
+  llvm::stable_sort(Schedule,
+                    [](const std::pair<MachineInstr *, unsigned> &A,
+                       const std::pair<MachineInstr *, unsigned> &B) {
+                      return A.second < B.second;
+                    });
+
+ // Rebuild clause order.
+ // Schedule holds ideal order for the load operations; however, each def
+ // can only be scheduled when it will no longer clobber any uses.
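+  // Greedily pick the earliest entry in Schedule whose def is not read by
+  // any other still-unscheduled load (the assert below relies on such an
+  // entry always existing).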
+ Clause.clear();
+ while (!Schedule.empty()) {
+ auto It = Schedule.begin();
+ while (It != Schedule.end()) {
+ MachineInstr *MI = It->first;
+
+ LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);
+
+ if (MI->getNumExplicitDefs() == 0) {
+ // No defs, always schedule.
+ Clause.push_back(MI);
+ break;
+ }
+
+ // FIXME: make this scan more efficient
+ Register Reg = MI->defs().begin()->getReg();
+ bool ClobbersUse = false;
+ for (auto SearchIt = Schedule.begin(); SearchIt != Schedule.end();
+ ++SearchIt) {
+ // We are allowed to clobber our own uses.
+ if (SearchIt == It)
+ continue;
+ if (SearchIt->first->readsRegister(Reg, TRI)) {
+ ClobbersUse = true;
+ break;
+ }
+ }
+      if (ClobbersUse) {
+        // This def would clobber a use in a not-yet-scheduled load; try the
+        // next entry in the schedule.
+        LLVM_DEBUG(dbgs() << "  Clobbers uses\n");
+        ++It;
+        continue;
+      }
+
+ // Safe to schedule.
+ LLVM_DEBUG(dbgs() << " OK!\n");
+ Clause.push_back(MI);
+ break;
+ }
+ assert(It != Schedule.end());
+ Schedule.erase(It);
+ }
+
+ // Apply order to instructions.
+ for (MachineInstr *MI : Clause)
+ MI->moveBefore(&*Next);
+
+ // FIXME: update kill flags
+
+ // Update start of bundle.
+ BundleStart = Clause[0]->getIterator();
+}
+
bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
BitVector KillUsedRegUnits(TRI->getNumRegUnits());
@@ -214,6 +340,7 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
BundleUsedRegUnits.reset();
}
+ reorderLoads(MBB, BundleStart, Next);
finalizeBundle(MBB, BundleStart, Next);
}
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index 0adce2b84aa0d3..8da0949190987f 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -61,15 +61,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: s_mov_b32 s7, s12
; SDAG-NEXT: s_clause 0x2
; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
-; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: v_mov_b32_e32 v4, v1
; SDAG-NEXT: s_waitcnt vmcnt(2)
; SDAG-NEXT: v_add_f32_e32 v0, v9, v0
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; SDAG-NEXT: v_mov_b32_e32 v2, v1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v1
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir
new file mode 100644
index 00000000000000..8001952a0e29c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reschedule-bundle-loads.mir
@@ -0,0 +1,198 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -start-before si-post-ra-bundler -o - %s | FileCheck %s
+
+--- |
+ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+ target triple = "amdgcn-amd-amdpal"
+ define dllexport amdgpu_cs void @_amdgpu_cs_main(ptr inreg noundef %userdata2, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+ ; CHECK-LABEL: _amdgpu_cs_main:
+ ; CHECK: ; %bb.0: ; %.entry
+ ; CHECK-NEXT: ; implicit-def: $vgpr11
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: v_cvt_u32_f32_e32 v15, v11
+ ; CHECK-NEXT: ; implicit-def: $vgpr12
+ ; CHECK-NEXT: v_cvt_u32_f32_e32 v28, v12
+ ; CHECK-NEXT: ; implicit-def: $vgpr16
+ ; CHECK-NEXT: ; implicit-def: $vgpr17
+ ; CHECK-NEXT: ; implicit-def: $vgpr19
+ ; CHECK-NEXT: ; implicit-def: $vgpr22
+ ; CHECK-NEXT: ; implicit-def: $vgpr24
+ ; CHECK-NEXT: ; implicit-def: $vgpr25
+ ; CHECK-NEXT: ; implicit-def: $vgpr26
+ ; CHECK-NEXT: ; implicit-def: $vgpr57
+ ; CHECK-NEXT: ; implicit-def: $vgpr58
+ ; CHECK-NEXT: ; implicit-def: $vgpr59
+ ; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
+ ; CHECK-NEXT: ; implicit-def: $vgpr6
+ ; CHECK-NEXT: v_lshlrev_b32_e32 v56, 3, v6
+ ; CHECK-NEXT: s_clause 0x1f
+ ; CHECK-NEXT: image_load v9, [v57, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v10, [v57, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v12, [v15, v58, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v36, [v15, v58, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v41, [v57, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v42, [v57, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v48, [v15, v28, v17], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v47, [v15, v28, v19], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v11, [v57, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v51, [v57, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v52, [v15, v58, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v35, [v15, v58, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v53, [v57, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v49, [v57, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v50, [v15, v28, v22], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v54, [v15, v28, v16], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v21, [v57, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v20, v[57:59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v32, [v15, v58, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v31, [v15, v58, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v38, [v57, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v37, [v57, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v43, [v15, v28, v59], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v44, [v15, v28, v24], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v30, [v57, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v29, [v57, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v34, [v15, v58, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v33, [v15, v58, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v40, [v57, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v39, [v57, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v45, [v15, v28, v25], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: image_load v46, [v15, v28, v26], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+ ; CHECK-NEXT: ; implicit-def: $vgpr7
+ ; CHECK-NEXT: ; implicit-def: $vgpr0
+ ; CHECK-NEXT: ; implicit-def: $vgpr3
+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v7
+ ; CHECK-NEXT: s_waitcnt vmcnt(30)
+ ; CHECK-NEXT: v_max3_f32 v55, v9, 0, v10
+ ; CHECK-NEXT: v_lshrrev_b32_e32 v9, 10, v0
+ ; CHECK-NEXT: s_waitcnt vmcnt(28)
+ ; CHECK-NEXT: v_max3_f32 v36, v12, 0, v36
+ ; CHECK-NEXT: v_lshlrev_b32_e32 v12, 1, v7
+ ; CHECK-NEXT: s_waitcnt vmcnt(26)
+ ; CHECK-NEXT: v_max3_f32 v41, v41, 0, v42
+ ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 1, v6
+ ; CHECK-NEXT: s_waitcnt vmcnt(24)
+ ; CHECK-NEXT: v_max3_f32 v42, v47, 0, v48
+ ; CHECK-NEXT: s_waitcnt vmcnt(22)
+ ; CHECK-NEXT: v_max3_f32 v47, v55, v11, v51
+ ; CHECK-NEXT: v_mad_u32_u24 v11, 0x90, v7, v56
+ ; CHECK-NEXT: s_waitcnt vmcnt(20)
+ ; CHECK-NEXT: v_max3_f32 v35, v36, v35, v52
+ ; CHECK-NEXT: s_waitcnt vmcnt(18)
+ ; CHECK-NEXT: v_max3_f32 v36, v41, v49, v53
+ ; CHECK-NEXT: s_waitcnt vmcnt(16)
+ ; CHECK-NEXT: v_max3_f32 v41, v42, v50, v54
+ ; CHECK-NEXT: s_waitcnt vmcnt(14)
+ ; CHECK-NEXT: v_max3_f32 v20, v47, v20, v21
+ ; CHECK-NEXT: v_add_nc_u32_e32 v21, 2, v3
+ ; CHECK-NEXT: s_waitcnt vmcnt(12)
+ ; CHECK-NEXT: v_max3_f32 v31, v35, v31, v32
+ ; CHECK-NEXT: s_waitcnt vmcnt(10)
+ ; CHECK-NEXT: v_max3_f32 v32, v36, v37, v38
+ ; CHECK-NEXT: s_waitcnt vmcnt(8)
+ ; CHECK-NEXT: v_max3_f32 v35, v41, v43, v44
+ ; CHECK-NEXT: s_waitcnt vmcnt(6)
+ ; CHECK-NEXT: v_max3_f32 v29, v20, v29, v30
+ ; CHECK-NEXT: v_add_nc_u32_e32 v20, 3, v3
+ ; CHECK-NEXT: s_waitcnt vmcnt(4)
+ ; CHECK-NEXT: v_max3_f32 v30, v31, v33, v34
+ ; CHECK-NEXT: s_waitcnt vmcnt(2)
+ ; CHECK-NEXT: v_max3_f32 v31, v32, v39, v40
+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
+ ; CHECK-NEXT: v_max3_f32 v32, v35, v45, v46
+ ; CHECK-NEXT: ds_store_2addr_b32 v11, v29, v31 offset1:1
+ ; CHECK-NEXT: ds_store_2addr_b32 v11, v30, v32 offset0:18 offset1:19
+ ; CHECK-NEXT: s_endpgm
+ .entry:
+ ret void
+ }
+
+...
+---
+name: _amdgpu_cs_main
+exposesReturnsTwice: false
+tracksRegLiveness: true
+body: |
+ bb.0..entry:
+
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr3 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $vgpr7 = IMPLICIT_DEF
+ $vgpr11 = IMPLICIT_DEF
+ $vgpr12 = IMPLICIT_DEF
+ $vgpr16 = IMPLICIT_DEF
+ $vgpr17 = IMPLICIT_DEF
+ $vgpr19 = IMPLICIT_DEF
+ $vgpr22 = IMPLICIT_DEF
+ $vgpr24 = IMPLICIT_DEF
+ $vgpr25 = IMPLICIT_DEF
+ $vgpr26 = IMPLICIT_DEF
+ $vgpr57 = IMPLICIT_DEF
+ $vgpr58 = IMPLICIT_DEF
+ $vgpr59 = IMPLICIT_DEF
+ $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = IMPLICIT_DEF
+
+ renamable $vgpr15 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr11, implicit $mode, implicit $exec
+ renamable $vgpr28 = nofpexcept V_CVT_U32_F32_e32 killed $vgpr12, implicit $mode, implicit $exec
+ renamable $vgpr20 = IMAGE_LOAD_V1_V3_gfx11 $vgpr57_vgpr58_vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr21 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr29 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr30 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr11 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr9 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr10 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr31 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr32 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr33 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr34 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr35 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr12 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr36 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr37 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr38 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr39 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr40 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr41 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr42 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr43 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr59, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr44 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr24, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr45 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr25, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr46 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr26, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr47 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr19, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr48 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr17, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr49 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr50 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr22, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr51 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr52 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr58, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr53 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr57, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr54 = IMAGE_LOAD_V1_V3_nsa_gfx11 renamable $vgpr15, renamable $vgpr28, renamable $vgpr16, renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, 1, 6, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
+ renamable $vgpr55 = V_MAX3_F32_e64 0, killed $vgpr9, 0, 0, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr9 = V_LSHRREV_B32_e32 10, $vgpr0, implicit $exec
+ renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr12, 0, 0, 0, killed $vgpr36, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr56 = V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec
+ renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr41, 0, 0, 0, killed $vgpr42, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr12 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr7, implicit $exec
+ renamable $vgpr42 = V_MAX3_F32_e64 0, killed $vgpr47, 0, 0, 0, killed $vgpr48, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr10 = nuw nsw V_LSHLREV_B32_e32 1, $vgpr6, implicit $exec
+ V_CMP_EQ_U32_e32 7, $vgpr7, implicit-def $vcc, implicit $exec
+ renamable $vgpr47 = V_MAX3_F32_e64 0, killed $vgpr55, 0, killed $vgpr11, 0, killed $vgpr51, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr35, 0, killed $vgpr52, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr36 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr49, 0, killed $vgpr53, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr41 = V_MAX3_F32_e64 0, killed $vgpr42, 0, killed $vgpr50, 0, killed $vgpr54, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr11 = V_MAD_U32_U24_e64 144, $vgpr7, killed $vgpr56, 0, implicit $exec
+ renamable $vgpr20 = V_MAX3_F32_e64 0, killed $vgpr47, 0, killed $vgpr20, 0, killed $vgpr21, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr31, 0, killed $vgpr32, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr36, 0, killed $vgpr37, 0, killed $vgpr38, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr35 = V_MAX3_F32_e64 0, killed $vgpr41, 0, killed $vgpr43, 0, killed $vgpr44, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr21 = V_ADD_U32_e32 2, $vgpr3, implicit $exec
+ renamable $vgpr29 = V_MAX3_F32_e64 0, killed $vgpr20, 0, killed $vgpr29, 0, killed $vgpr30, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr30 = V_MAX3_F32_e64 0, killed $vgpr31, 0, killed $vgpr33, 0, killed $vgpr34, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr31 = V_MAX3_F32_e64 0, killed $vgpr32, 0, killed $vgpr39, 0, killed $vgpr40, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr32 = V_MAX3_F32_e64 0, killed $vgpr35, 0, killed $vgpr45, 0, killed $vgpr46, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr20 = V_ADD_U32_e32 3, $vgpr3, implicit $exec
+ DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr29, killed renamable $vgpr31, 0, 1, 0, implicit $exec :: (store (s32) into %ir.out0, addrspace 3), (store (s32) into %ir.out1, addrspace 3)
+ DS_WRITE2_B32_gfx9 renamable $vgpr11, killed renamable $vgpr30, killed renamable $vgpr32, 18, 19, 0, implicit $exec :: (store (s32) into %ir.out2, addrspace 3), (store (s32) into %ir.out3, addrspace 3)
+ S_ENDPGM 0
+
+...