[llvm] [AMDGPU][SILoadStoreOptimizer] Try to find common base for L/Ss with 0 offset (PR #71126)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 16:34:33 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jeffrey Byrnes (jrbyrnes)
<details>
<summary>Changes</summary>
By collecting loads/stores with a 0 offset during promoteConstantOffsetToImm, the coherent AddrMode reasoning is aware of them and we improve addressing codegen in some cases.
---
Patch is 43.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71126.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+21-14)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+201-199)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir (+123)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 17105965471f65b..36c05c72f427b93 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2023,9 +2023,15 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
if (Offset0P)
BaseLo = *Src1;
else {
- if (!(Offset0P = extractConstOffset(*Src1)))
- return;
- BaseLo = *Src0;
+ if ((Offset0P = extractConstOffset(*Src1)))
+ BaseLo = *Src0;
+ }
+
+ // We are unable to find an offset by looking through BaseLo. Try using 0
+ // offset with BaseLo as the base.
+ if (!Offset0P) {
+ Offset0P = 0;
+ BaseLo = Def->getOperand(1);
}
Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
@@ -2034,17 +2040,24 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
if (Src0->isImm())
std::swap(Src0, Src1);
- if (!Src1->isImm())
- return;
+ std::optional<uint64_t> Offset1;
+ if (Src1->isImm()) {
+ Offset1 = Src1->getImm();
+ BaseHi = *Src0;
+ }
- uint64_t Offset1 = Src1->getImm();
- BaseHi = *Src0;
+ // We are unable to find an offset by looking through BaseHi. Try using 0
+ // offset with BaseHi as the base.
+ if (!Offset1) {
+ Offset1 = 0;
+ BaseHi = Def->getOperand(3);
+ }
Addr.Base.LoReg = BaseLo.getReg();
Addr.Base.HiReg = BaseHi.getReg();
Addr.Base.LoSubReg = BaseLo.getSubReg();
Addr.Base.HiSubReg = BaseHi.getSubReg();
- Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+ Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (*Offset1 << 32);
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
@@ -2082,12 +2095,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
} else
MAddr = Visited[&MI];
- if (MAddr.Offset == 0) {
- LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
- " constant offsets that can be promoted.\n";);
- return false;
- }
-
LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
<< MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..4bb619209e6a68b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -110,53 +110,53 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v18
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v20
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: s_movk_i32 s1, 0x2000
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096
; GFX9-NEXT: s_movk_i32 s0, 0x1000
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048
-; GFX9-NEXT: global_load_dwordx2 v[14:15], v[6:7], off
-; GFX9-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048
-; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-4096
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:2048
+; GFX9-NEXT: s_movk_i32 s1, 0x3000
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-4096
+; GFX9-NEXT: s_movk_i32 s0, 0x2000
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[14:15], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[18:19], v[2:3], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v7, vcc
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: clmem_read_simplified:
@@ -185,46 +185,45 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v1, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, 0x800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, 0x1800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:-2048
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, 0x2800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off
-; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x3000
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off
+; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off offset:-2048
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v14, 0x3800
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v15, vcc_lo
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dwordx2 v[14:15], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:-2048
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
-; GFX10-NEXT: global_load_dwordx2 v[18:19], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[18:19], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v10, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v12, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v13, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v14, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v15, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v16, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v17, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v18, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v19, v1, vcc_lo
@@ -253,48 +252,51 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0
+; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[10:11], v[6:7], off offset:-4096
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v0, 0x3000
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048
+; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off offset:-4096
+; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048
-; GFX11-NEXT: global_load_b64 v[14:15], v[0:1], off
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
@@ -1148,10 +1150,10 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_movk_i32 s0, 0x1000
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:1024
-; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:2048
-; GFX9-NEXT: global_load_dword v8, v[0:1], off offset:3072
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:1024
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:3072
+; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:-4096
; GFX9-NEXT: global_load_dword v9, v[2:3], off
; GFX9-NEXT: global_load_dword v10, v[2:3], off offset:1024
; GFX9-NEXT: global_load_dword v11, v[2:3], off offset:2048
@@ -1160,10 +1162,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:1024
-; GFX9-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NEXT: v_add_u32_e32 v0, v6, v5
; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: v_add3_u32 v0, v7, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v0, v5, v8
+; GFX9-NEXT: v_add3_u32 v0, v6, v0, v7
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10
; GFX9-NEXT: s_waitcnt vmcnt(2)
@@ -1201,41 +1202,42 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x800
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v9, v[0:1], off offset:1024
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, 0x1000, v0
-; GFX10-NEXT: s_clause 0x4
-; GFX10-NEXT: global_load_dword v9, v[0:1], off
-; GFX10-NEXT: global_load_dword v10, v[0:1], off offset:1024
-; GFX10-NEXT: global_load_dword v11, v[2:3], off offset:1024
-; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:-2048
-; GFX10-NEXT: global_load_dword v13, v[4:5], off
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v10, v[2:3], off offset:-2048
+; GFX10-NEXT: global_load_dword v11, v[2:3], off
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1800, v0
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1800
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v12, v[4:5], off offset:1024
+; GFX10-NEXT: global_load_dword v13, v[6:7], off offset:1024
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v14, v[6:7], off offset:1024
-; GFX10-NEXT: global_load_dword v15, v[2:3], off offset:1024
+; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:-2048
+; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: global_load_dword v2, v[4:5], off offset:-2048
-; GFX10-NEXT: global_load_dword v3, v[4:5], off
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:1024
+; GFX10-NEXT: global_load_dword v2, v[4:5], off offset:1024
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: global_load_dword v14, v[0:1], off offset:1024
; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v10, v9
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v9, v10
; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_add3_u32 v0, v12, v0, v11
+; GFX10-NEXT: v_add3_u32 v0, v11, v0, v12
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_add3_u32 v0, v13, v0, v14
+; GFX10-NEXT: v_add3_u32 v0, v6, v0, v13
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_add3_u32 v0, v2, v0, v15
+; GFX10-NEXT: v_add3_u32 v0, v7, v0, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add3_u32 v0, v3, v0, v6
+; GFX10-NEXT: v_add3_u32 v0, v3, v0, v14
; GFX10-NEXT: global_store_dword v8, v0, s[34:35]
; GFX10-NEXT: s_endpgm
;
@@ -1261,37 +1263,38 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v7, v[0:1], off
-; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v7, v[0:1], off...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/71126
More information about the llvm-commits mailing list