[llvm] 86815a1 - AMDGPU/GlobalISel: Permit mapping G_FRAME_INDEX to sgprs (#101325)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 31 22:42:07 PDT 2024
Author: Matt Arsenault
Date: 2024-08-01T09:42:04+04:00
New Revision: 86815a1842d308521f46048bb9ed08e47c0d8357
URL: https://github.com/llvm/llvm-project/commit/86815a1842d308521f46048bb9ed08e47c0d8357
DIFF: https://github.com/llvm/llvm-project/commit/86815a1842d308521f46048bb9ed08e47c0d8357.diff
LOG: AMDGPU/GlobalISel: Permit mapping G_FRAME_INDEX to sgprs (#101325)
eliminateFrameIndex should now properly handle materializing
frame indices in SGPRs, so map G_FRAME_INDEX to the SGPR bank like
the other constant-like opcodes (G_CONSTANT, G_GLOBAL_VALUE, etc.).
On average this will produce worse code for now; we still need to
detect VGPR uses and improve SGPR->VGPR frame index folds.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8da8c94b4d665..9a6ba5ac68084 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4060,6 +4060,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT:
case AMDGPU::G_GLOBAL_VALUE:
+ case AMDGPU::G_FRAME_INDEX:
case AMDGPU::G_BLOCK_ADDR:
case AMDGPU::G_READSTEADYCOUNTER:
case AMDGPU::G_READCYCLECOUNTER: {
@@ -4067,13 +4068,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case AMDGPU::G_FRAME_INDEX: {
- // TODO: This should be the same as other constants, but eliminateFrameIndex
- // currently assumes VALU uses.
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- break;
- }
case AMDGPU::G_DYN_STACKALLOC: {
// Result is always uniform, and a wave reduction is needed for the source.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
index 48916d8d9b2c5..84378bcb70684 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
@@ -10,9 +10,11 @@ define amdgpu_kernel void @stack_write_fi() {
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, s5
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b4b95fdab4ab2..4fdb4082346af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -10,11 +10,13 @@ define amdgpu_ps void @amdgpu_ps() {
; MESA-LABEL: amdgpu_ps:
; MESA: ; %bb.0:
; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT: s_mov_b64 s[0:1], src_private_base
; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT: v_mov_b32_e32 v0, 0
-; MESA-NEXT: v_mov_b32_e32 v1, s1
+; MESA-NEXT: s_mov_b32 s0, 0
+; MESA-NEXT: s_mov_b64 s[2:3], src_private_base
+; MESA-NEXT: s_mov_b32 s1, s3
+; MESA-NEXT: v_mov_b32_e32 v0, s0
; MESA-NEXT: v_mov_b32_e32 v2, 0
+; MESA-NEXT: v_mov_b32_e32 v1, s1
; MESA-NEXT: flat_store_dword v[0:1], v2
; MESA-NEXT: s_waitcnt vmcnt(0)
; MESA-NEXT: s_endpgm
@@ -24,13 +26,15 @@ define amdgpu_ps void @amdgpu_ps() {
; PAL-NEXT: s_getpc_b64 s[2:3]
; PAL-NEXT: s_mov_b32 s2, s0
; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; PAL-NEXT: v_mov_b32_e32 v0, 0
; PAL-NEXT: v_mov_b32_e32 v2, 0
; PAL-NEXT: s_waitcnt lgkmcnt(0)
; PAL-NEXT: s_and_b32 s3, s3, 0xffff
; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT: s_mov_b64 s[0:1], src_private_base
; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT: s_mov_b32 s0, 0
+; PAL-NEXT: s_mov_b64 s[2:3], src_private_base
+; PAL-NEXT: s_mov_b32 s1, s3
+; PAL-NEXT: v_mov_b32_e32 v0, s0
; PAL-NEXT: v_mov_b32_e32 v1, s1
; PAL-NEXT: flat_store_dword v[0:1], v2
; PAL-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a5e4151bf3695..f4fd803c8dda8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT: s_add_i32 s0, s0, 0
+; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_add_i32 s0, s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, 0
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_sindex_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s1, s0, 2
-; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
+; GFX12-NEXT: s_lshl_b32 s1, s1, 2
+; GFX12-NEXT: s_add_co_i32 s0, s0, 0
+; GFX12-NEXT: s_add_co_i32 s1, s1, 0
+; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1
+; GFX940-NEXT: s_addk_i32 s0, 0x100
+; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_addk_i32 s0, 0x100
+; GFX11-NEXT: s_addk_i32 s1, 0x100
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s1, s0, 2
-; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT: s_lshl_b32 s1, s1, 2
+; GFX12-NEXT: s_addk_co_i32 s0, 0x100
+; GFX12-NEXT: s_addk_co_i32 s1, 0x100
+; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: s_movk_i32 s0, 0x4004
-; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
+; GFX940-NEXT: s_addk_i32 s0, 0x4004
+; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_large_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_addk_i32 s0, 0x4004
+; GFX11-NEXT: s_addk_i32 s1, 0x4004
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_movk_i32 s0, 0x4004
-; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: v_mov_b32_e32 v0, 15
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b32 s1, s0, 2
-; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_and_b32 s1, s0, 15
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT: s_lshl_b32 s1, s1, 2
+; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
+; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm
bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_mov_b32_e32 v0, 13
+; GFX940-NEXT: s_movk_i32 s0, 0x3e80
; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, 15
+; GFX940-NEXT: s_add_i32 s0, s0, 4
+; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:4 dlc
+; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, 13
+; GFX940-NEXT: s_movk_i32 s0, 0x3e80
+; GFX940-NEXT: s_add_i32 s1, s32, 4
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT: v_mov_b32_e32 v1, 15
-; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, 15
+; GFX940-NEXT: s_add_i32 s0, s0, s1
+; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
+; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_large_imm_offset_foo:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_add_i32 s1, s32, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc
+; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index db944b98a3013..4fcde0f2fc7cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -11,12 +11,11 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10
; GCN-NEXT: s_add_u32 s0, s0, s13
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
-; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
@@ -143,16 +142,17 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN-NEXT: v_mov_b32_e32 v0, s48
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240
; GCN-NEXT: v_mov_b32_e32 v0, s49
+; GCN-NEXT: s_and_b32 s4, s25, 63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244
; GCN-NEXT: v_mov_b32_e32 v0, s50
-; GCN-NEXT: s_and_b32 s4, s25, 63
+; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248
; GCN-NEXT: v_mov_b32_e32 v0, s51
-; GCN-NEXT: s_lshl_b32 s4, s4, 2
+; GCN-NEXT: s_add_u32 s4, 0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252
-; GCN-NEXT: v_add_u32_e32 v0, s4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s24
-; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v0, s24
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
index 14f69c301ec38..76994c5cccf5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
@@ -2,22 +2,45 @@
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
---- |
- target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
- define void @test_frame_index_p5() {
- %ptr0 = alloca i32, addrspace(5)
- ret void
- }
-...
---
name: test_frame_index_p5
legalized: true
stack:
- - { id: 0, name: ptr0, offset: 0, size: 4, alignment: 4 }
+ - { id: 0, offset: 0, size: 4, alignment: 4 }
body: |
bb.0:
; CHECK-LABEL: name: test_frame_index_p5
- ; CHECK: [[FRAME_INDEX:%[0-9]+]]:vgpr(p5) = G_FRAME_INDEX %stack.0.ptr0
- %0:_(p5) = G_FRAME_INDEX %stack.0.ptr0
+ ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+ %0:_(p5) = G_FRAME_INDEX %stack.0
+
+...
+
+---
+name: test_frame_index_p5_sgpr_use
+legalized: true
+stack:
+ - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_frame_index_p5_sgpr_use
+ ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: $sgpr0 = COPY [[FRAME_INDEX]](p5)
+ %0:_(p5) = G_FRAME_INDEX %stack.0
+ $sgpr0 = COPY %0
+
+...
+
+---
+name: test_frame_index_p5_vgpr_use
+legalized: true
+stack:
+ - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_frame_index_p5_vgpr_use
+ ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5)
+ %0:_(p5) = G_FRAME_INDEX %stack.0
+ $vgpr0 = COPY %0
...
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index e2b4865410db8..3216e71e6221a 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -96,16 +96,29 @@ define void @private_alloca_to_flat(ptr %ptr) {
; OPT-NEXT: store volatile i32 7, ptr [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; ASM-LABEL: private_alloca_to_flat:
-; ASM: ; %bb.0:
-; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; ASM-NEXT: v_mov_b32_e32 v1, s5
-; ASM-NEXT: v_mov_b32_e32 v2, 7
-; ASM-NEXT: flat_store_dword v[0:1], v2
-; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; ASM-NEXT: s_setpc_b64 s[30:31]
+; DAGISEL-ASM-LABEL: private_alloca_to_flat:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-ASM-LABEL: private_alloca_to_flat:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6
+; GISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base
+; GISEL-ASM-NEXT: s_mov_b32 s5, s7
+; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i8, addrspace(5)
%x = addrspacecast ptr addrspace(5) %alloca to ptr
store volatile i32 7, ptr %x
@@ -224,8 +237,9 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-ASM-NEXT: s_lshr_b32 s6, s32, 6
; GISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s6
; GISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GISEL-ASM-NEXT: ; %bb.1: ; %then
; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 087d38ce7b004..89da9b8e75bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -37,17 +37,17 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff1_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
@@ -78,12 +78,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
@@ -114,8 +116,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -166,13 +169,13 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -208,10 +211,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -246,10 +251,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -300,13 +307,13 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -342,10 +349,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -380,10 +389,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -434,18 +445,18 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff2_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
@@ -478,14 +489,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
@@ -519,8 +531,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -571,14 +584,14 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -615,12 +628,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -656,12 +670,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -712,14 +727,14 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -756,12 +771,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -797,12 +813,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -853,18 +870,18 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX940-GISEL-LABEL: soff4_voff1:
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4
@@ -897,14 +914,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
@@ -938,8 +956,9 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -990,14 +1009,14 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -1034,12 +1053,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -1075,12 +1095,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -1130,14 +1151,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX940-GISEL: ; %bb.0: ; %bb
; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24
; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1
; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT: scratch_store_byte v2, v1, off sc0 sc1
; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2
@@ -1174,12 +1195,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL: ; %bb.0: ; %bb
; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0
@@ -1215,12 +1237,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
More information about the llvm-commits
mailing list